pdf2swf/xpdf/Lexer.cc

   1 //========================================================================
   2 //
   3 // Lexer.cc
   4 //
   5 // Copyright 1996-2003 Glyph & Cog, LLC
   6 //
   7 //========================================================================
   8
   9 #include <aconf.h>
  10
  11 #ifdef USE_GCC_PRAGMAS
  12 #pragma implementation
  13 #endif
  14
  15 #include <stdlib.h>
  16 #include <stddef.h>
  17 #include <string.h>
  18 #include <ctype.h>
  19 #include "Lexer.h"
  20 #include "Error.h"
  21
  22 //------------------------------------------------------------------------
  23
  24 // A '1' in this array means the character is white space.  A '1' or
  25 // '2' means the character ends a name or command.
  26 static char specialChars[256] = {
  27   1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
  28   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
  29   1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
  30   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
  31   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
  32   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
  33   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
  34   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
  35   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
  36   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
  37   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
  38   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
  39   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
  40   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
  41   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
  42   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
  43 };
  44
  45 //------------------------------------------------------------------------
  46 // Lexer
  47 //------------------------------------------------------------------------
  48
  49 Lexer::Lexer(XRef *xref, Stream *str) {
  50   Object obj;
  51
  52   curStr.initStream(str);
  53   streams = new Array(xref);
  54   streams->add(curStr.copy(&obj));
  55   strPtr = 0;
  56   freeArray = gTrue;
  57   curStr.streamReset();
  58 }
  59
  60 Lexer::Lexer(XRef *xref, Object *obj) {
  61   Object obj2;
  62
  63   if (obj->isStream()) {
  64     streams = new Array(xref);
  65     freeArray = gTrue;
  66     streams->add(obj->copy(&obj2));
  67   } else {
  68     streams = obj->getArray();
  69     freeArray = gFalse;
  70   }
  71   strPtr = 0;
  72   if (streams->getLength() > 0) {
  73     streams->get(strPtr, &curStr);
  74     curStr.streamReset();
  75   }
  76 }
  77 static int illegalChars = 0;
  78
  79 Lexer::~Lexer() {
  80   if (!curStr.isNone()) {
  81     curStr.streamClose();
  82     curStr.free();
  83   }
  84   if (freeArray) {
  85     delete streams;
  86   }
  87   if(illegalChars)
  88       error(0, "Illegal characters in hex string (%d)", illegalChars);
  89   illegalChars = 0;
  90 }
  91
  92 int Lexer::getChar() {
  93   int c;
  94
  95   c = EOF;
  96   while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
  97     curStr.streamClose();
  98     curStr.free();
  99     ++strPtr;
 100     if (strPtr < streams->getLength()) {
 101       streams->get(strPtr, &curStr);
 102       curStr.streamReset();
 103     }
 104   }
 105   return c;
 106 }
 107
 108 int Lexer::lookChar() {
 109   if (curStr.isNone()) {
 110     return EOF;
 111   }
 112   return curStr.streamLookChar();
 113 }
 114
 115 Object *Lexer::getObj(Object *obj) {
 116   char *p;
 117   int c, c2;
 118   GBool comment, neg, done;
 119   int numParen;
 120   int xi;
 121   double xf, scale;
 122   GString *s;
 123   int n, m;
 124
 125   // skip whitespace and comments
 126   comment = gFalse;
 127   while (1) {
 128     if ((c = getChar()) == EOF) {
 129       return obj->initEOF();
 130     }
 131     if (comment) {
 132       if (c == '\r' || c == '\n')
 133         comment = gFalse;
 134     } else if (c == '%') {
 135       comment = gTrue;
 136     } else if (specialChars[c] != 1) {
 137       break;
 138     }
 139   }
 140
 141   // start reading token
 142   switch (c) {
 143
 144   // number
 145   case '0': case '1': case '2': case '3': case '4':
 146   case '5': case '6': case '7': case '8': case '9':
 147   case '-': case '.':
 148     neg = gFalse;
 149     xi = 0;
 150     if (c == '-') {
 151       neg = gTrue;
 152     } else if (c == '.') {
 153       goto doReal;
 154     } else {
 155       xi = c - '0';
 156     }
 157     while (1) {
 158       c = lookChar();
 159       if (isdigit(c)) {
 160         getChar();
 161         xi = xi * 10 + (c - '0');
 162       } else if (c == '.') {
 163         getChar();
 164         goto doReal;
 165       } else {
 166         break;
 167       }
 168     }
 169     if (neg)
 170       xi = -xi;
 171     obj->initInt(xi);
 172     break;
 173   doReal:
 174     xf = xi;
 175     scale = 0.1;
 176     while (1) {
 177       c = lookChar();
 178       if (c == '-') {
 179         // ignore minus signs in the middle of numbers to match
 180         // Adobe's behavior
 181         error(getPos(), "Badly formatted number");
 182         getChar();
 183         continue;
 184       }
 185       if (!isdigit(c)) {
 186         break;
 187       }
 188       getChar();
 189       xf = xf + scale * (c - '0');
 190       scale *= 0.1;
 191     }
 192     if (neg)
 193       xf = -xf;
 194     obj->initReal(xf);
 195     break;
 196
 197   // string
 198   case '(':
 199     p = tokBuf;
 200     n = 0;
 201     numParen = 1;
 202     done = gFalse;
 203     s = NULL;
 204     do {
 205       c2 = EOF;
 206       switch (c = getChar()) {
 207
 208       case EOF:
 209 #if 0
 210       // This breaks some PDF files, e.g., ones from Photoshop.
 211       case '\r':
 212       case '\n':
 213 #endif
 214         error(getPos(), "Unterminated string");
 215         done = gTrue;
 216         break;
 217
 218       case '(':
 219         ++numParen;
 220         c2 = c;
 221         break;
 222
 223       case ')':
 224         if (--numParen == 0) {
 225           done = gTrue;
 226         } else {
 227           c2 = c;
 228         }
 229         break;
 230
 231       case '\\':
 232         switch (c = getChar()) {
 233         case 'n':
 234           c2 = '\n';
 235           break;
 236         case 'r':
 237           c2 = '\r';
 238           break;
 239         case 't':
 240           c2 = '\t';
 241           break;
 242         case 'b':
 243           c2 = '\b';
 244           break;
 245         case 'f':
 246           c2 = '\f';
 247           break;
 248         case '\\':
 249         case '(':
 250         case ')':
 251           c2 = c;
 252           break;
 253         case '0': case '1': case '2': case '3':
 254         case '4': case '5': case '6': case '7':
 255           c2 = c - '0';
 256           c = lookChar();
 257           if (c >= '0' && c <= '7') {
 258             getChar();
 259             c2 = (c2 << 3) + (c - '0');
 260             c = lookChar();
 261             if (c >= '0' && c <= '7') {
 262               getChar();
 263               c2 = (c2 << 3) + (c - '0');
 264             }
 265           }
 266           break;
 267         case '\r':
 268           c = lookChar();
 269           if (c == '\n') {
 270             getChar();
 271           }
 272           break;
 273         case '\n':
 274           break;
 275         case EOF:
 276           error(getPos(), "Unterminated string");
 277           done = gTrue;
 278           break;
 279         default:
 280           c2 = c;
 281           break;
 282         }
 283         break;
 284
 285       default:
 286         c2 = c;
 287         break;
 288       }
 289
 290       if (c2 != EOF) {
 291         if (n == tokBufSize) {
 292           if (!s)
 293             s = new GString(tokBuf, tokBufSize);
 294           else
 295             s->append(tokBuf, tokBufSize);
 296           p = tokBuf;
 297           n = 0;
 298         }
 299         *p++ = (char)c2;
 300         ++n;
 301       }
 302     } while (!done);
 303     if (!s)
 304       s = new GString(tokBuf, n);
 305     else
 306       s->append(tokBuf, n);
 307     obj->initString(s);
 308     break;
 309
 310   // name
 311   case '/':
 312     p = tokBuf;
 313     n = 0;
 314     while ((c = lookChar()) != EOF && !specialChars[c]) {
 315       getChar();
 316       if (c == '#') {
 317         c2 = lookChar();
 318         if (c2 >= '0' && c2 <= '9') {
 319           c = c2 - '0';
 320         } else if (c2 >= 'A' && c2 <= 'F') {
 321           c = c2 - 'A' + 10;
 322         } else if (c2 >= 'a' && c2 <= 'f') {
 323           c = c2 - 'a' + 10;
 324         } else {
 325           goto notEscChar;
 326         }
 327         getChar();
 328         c <<= 4;
 329         c2 = getChar();
 330         if (c2 >= '0' && c2 <= '9') {
 331           c += c2 - '0';
 332         } else if (c2 >= 'A' && c2 <= 'F') {
 333           c += c2 - 'A' + 10;
 334         } else if (c2 >= 'a' && c2 <= 'f') {
 335           c += c2 - 'a' + 10;
 336         } else {
 337           illegalChars++;
 338           //error(getPos(), "Illegal digit in hex char in name");
 339         }
 340       }
 341      notEscChar:
 342       if (++n == tokBufSize) {
 343         error(getPos(), "Name token too long");
 344         break;
 345       }
 346       *p++ = c;
 347     }
 348     *p = '\0';
 349     obj->initName(tokBuf);
 350     break;
 351
 352   // array punctuation
 353   case '[':
 354   case ']':
 355     tokBuf[0] = c;
 356     tokBuf[1] = '\0';
 357     obj->initCmd(tokBuf);
 358     break;
 359
 360   // hex string or dict punctuation
 361   case '<':
 362     c = lookChar();
 363
 364     // dict punctuation
 365     if (c == '<') {
 366       getChar();
 367       tokBuf[0] = tokBuf[1] = '<';
 368       tokBuf[2] = '\0';
 369       obj->initCmd(tokBuf);
 370
 371     // hex string
 372     } else {
 373       p = tokBuf;
 374       m = n = 0;
 375       c2 = 0;
 376       s = NULL;
 377       while (1) {
 378         c = getChar();
 379         if (c == '>') {
 380           break;
 381         } else if (c == EOF) {
 382           error(getPos(), "Unterminated hex string");
 383           break;
 384         } else if (specialChars[c] != 1) {
 385           c2 = c2 << 4;
 386           if (c >= '0' && c <= '9')
 387             c2 += c - '0';
 388           else if (c >= 'A' && c <= 'F')
 389             c2 += c - 'A' + 10;
 390           else if (c >= 'a' && c <= 'f')
 391             c2 += c - 'a' + 10;
 392           else {
 393             illegalChars++;
 394             //error(getPos(), "Illegal character <%02x> in hex string", c);
 395           }
 396           if (++m == 2) {
 397             if (n == tokBufSize) {
 398               if (!s)
 399                 s = new GString(tokBuf, tokBufSize);
 400               else
 401                 s->append(tokBuf, tokBufSize);
 402               p = tokBuf;
 403               n = 0;
 404             }
 405             *p++ = (char)c2;
 406             ++n;
 407             c2 = 0;
 408             m = 0;
 409           }
 410         }
 411       }
 412       if (!s)
 413         s = new GString(tokBuf, n);
 414       else
 415         s->append(tokBuf, n);
 416       if (m == 1)
 417         s->append((char)(c2 << 4));
 418       obj->initString(s);
 419     }
 420     break;
 421
 422   // dict punctuation
 423   case '>':
 424     c = lookChar();
 425     if (c == '>') {
 426       getChar();
 427       tokBuf[0] = tokBuf[1] = '>';
 428       tokBuf[2] = '\0';
 429       obj->initCmd(tokBuf);
 430     } else {
 431       illegalChars++;
 432       //error(getPos(), "Illegal character '>'");
 433       obj->initError();
 434     }
 435     break;
 436
 437   // error
 438   case ')':
 439   case '{':
 440   case '}':
 441     //error(getPos(), "Illegal character '%c'", c);
 442     illegalChars++;
 443     obj->initError();
 444     break;
 445
 446   // command
 447   default:
 448     p = tokBuf;
 449     *p++ = c;
 450     n = 1;
 451     while ((c = lookChar()) != EOF && !specialChars[c]) {
 452       getChar();
 453       if (++n == tokBufSize) {
 454         error(getPos(), "Command token too long");
 455         break;
 456       }
 457       *p++ = c;
 458     }
 459     *p = '\0';
 460     if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
 461       obj->initBool(gTrue);
 462     } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
 463       obj->initBool(gFalse);
 464     } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
 465       obj->initNull();
 466     } else {
 467       obj->initCmd(tokBuf);
 468     }
 469     break;
 470   }
 471   return obj;
 472 }
 473
 474 void Lexer::skipToNextLine() {
 475   int c;
 476
 477   while (1) {
 478     c = getChar();
 479     if (c == EOF || c == '\n') {
 480       return;
 481     }
 482     if (c == '\r') {
 483       if ((c = lookChar()) == '\n') {
 484         getChar();
 485       }
 486       return;
 487     }
 488   }
 489 }
 490
 491 GBool Lexer::isSpace(int c) {
 492   return c >= 0 && c <= 0xff && specialChars[c] == 1;
 493 }