pdf2swf/xpdf/Lexer.cc

   1 //========================================================================
   2 //
   3 // Lexer.cc
   4 //
   5 // Copyright 1996-2002 Glyph & Cog, LLC
   6 //
   7 //========================================================================
   8
   9 #ifdef __GNUC__
  10 #pragma implementation
  11 #endif
  12
#include <aconf.h>
#include <limits.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <ctype.h>
#include "Lexer.h"
#include "Error.h"
  20
  21 //------------------------------------------------------------------------
  22
  23 // A '1' in this array means the character is white space.  A '1' or
  24 // '2' means the character ends a name or command.
  25 static char specialChars[256] = {
  26   1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
  27   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
  28   1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
  29   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
  30   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
  31   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
  32   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
  33   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
  34   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
  35   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
  36   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
  37   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
  38   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
  39   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
  40   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
  41   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
  42 };
  43
  44 //------------------------------------------------------------------------
  45 // Lexer
  46 //------------------------------------------------------------------------
  47
  48 Lexer::Lexer(XRef *xref, Stream *str) {
  49   Object obj;
  50
  51   curStr.initStream(str);
  52   streams = new Array(xref);
  53   streams->add(curStr.copy(&obj));
  54   strPtr = 0;
  55   freeArray = gTrue;
  56   curStr.streamReset();
  57 }
  58
  59 Lexer::Lexer(XRef *xref, Object *obj) {
  60   Object obj2;
  61
  62   if (obj->isStream()) {
  63     streams = new Array(xref);
  64     freeArray = gTrue;
  65     streams->add(obj->copy(&obj2));
  66   } else {
  67     streams = obj->getArray();
  68     freeArray = gFalse;
  69   }
  70   strPtr = 0;
  71   if (streams->getLength() > 0) {
  72     streams->get(strPtr, &curStr);
  73     curStr.streamReset();
  74   }
  75 }
  76
  77 Lexer::~Lexer() {
  78   if (!curStr.isNone()) {
  79     curStr.streamClose();
  80     curStr.free();
  81   }
  82   if (freeArray) {
  83     delete streams;
  84   }
  85 }
  86
  87 int Lexer::getChar() {
  88   int c;
  89
  90   c = EOF;
  91   while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
  92     curStr.streamClose();
  93     curStr.free();
  94     ++strPtr;
  95     if (strPtr < streams->getLength()) {
  96       streams->get(strPtr, &curStr);
  97       curStr.streamReset();
  98     }
  99   }
 100   return c;
 101 }
 102
 103 int Lexer::lookChar() {
 104   if (curStr.isNone()) {
 105     return EOF;
 106   }
 107   return curStr.streamLookChar();
 108 }
 109
 110 Object *Lexer::getObj(Object *obj) {
 111   char *p;
 112   int c, c2;
 113   GBool comment, neg, done;
 114   int numParen;
 115   int xi;
 116   double xf, scale;
 117   GString *s;
 118   int n, m;
 119
 120   // skip whitespace and comments
 121   comment = gFalse;
 122   while (1) {
 123     if ((c = getChar()) == EOF) {
 124       return obj->initEOF();
 125     }
 126     if (comment) {
 127       if (c == '\r' || c == '\n')
 128         comment = gFalse;
 129     } else if (c == '%') {
 130       comment = gTrue;
 131     } else if (specialChars[c] != 1) {
 132       break;
 133     }
 134   }
 135
 136   // start reading token
 137   switch (c) {
 138
 139   // number
 140   case '0': case '1': case '2': case '3': case '4':
 141   case '5': case '6': case '7': case '8': case '9':
 142   case '-': case '.':
 143     neg = gFalse;
 144     xi = 0;
 145     if (c == '-') {
 146       neg = gTrue;
 147     } else if (c == '.') {
 148       goto doReal;
 149     } else {
 150       xi = c - '0';
 151     }
 152     while (1) {
 153       c = lookChar();
 154       if (isdigit(c)) {
 155         getChar();
 156         xi = xi * 10 + (c - '0');
 157       } else if (c == '.') {
 158         getChar();
 159         goto doReal;
 160       } else {
 161         break;
 162       }
 163     }
 164     if (neg)
 165       xi = -xi;
 166     obj->initInt(xi);
 167     break;
 168   doReal:
 169     xf = xi;
 170     scale = 0.1;
 171     while (1) {
 172       c = lookChar();
 173       if (!isdigit(c)) {
 174         break;
 175       }
 176       getChar();
 177       xf = xf + scale * (c - '0');
 178       scale *= 0.1;
 179     }
 180     if (neg)
 181       xf = -xf;
 182     obj->initReal(xf);
 183     break;
 184
 185   // string
 186   case '(':
 187     p = tokBuf;
 188     n = 0;
 189     numParen = 1;
 190     done = gFalse;
 191     s = NULL;
 192     do {
 193       c2 = EOF;
 194       switch (c = getChar()) {
 195
 196       case EOF:
 197 #if 0
 198       // This breaks some PDF files, e.g., ones from Photoshop.
 199       case '\r':
 200       case '\n':
 201 #endif
 202         error(getPos(), "Unterminated string");
 203         done = gTrue;
 204         break;
 205
 206       case '(':
 207         ++numParen;
 208         c2 = c;
 209         break;
 210
 211       case ')':
 212         if (--numParen == 0) {
 213           done = gTrue;
 214         } else {
 215           c2 = c;
 216         }
 217         break;
 218
 219       case '\\':
 220         switch (c = getChar()) {
 221         case 'n':
 222           c2 = '\n';
 223           break;
 224         case 'r':
 225           c2 = '\r';
 226           break;
 227         case 't':
 228           c2 = '\t';
 229           break;
 230         case 'b':
 231           c2 = '\b';
 232           break;
 233         case 'f':
 234           c2 = '\f';
 235           break;
 236         case '\\':
 237         case '(':
 238         case ')':
 239           c2 = c;
 240           break;
 241         case '0': case '1': case '2': case '3':
 242         case '4': case '5': case '6': case '7':
 243           c2 = c - '0';
 244           c = lookChar();
 245           if (c >= '0' && c <= '7') {
 246             getChar();
 247             c2 = (c2 << 3) + (c - '0');
 248             c = lookChar();
 249             if (c >= '0' && c <= '7') {
 250               getChar();
 251               c2 = (c2 << 3) + (c - '0');
 252             }
 253           }
 254           break;
 255         case '\r':
 256           c = lookChar();
 257           if (c == '\n') {
 258             getChar();
 259           }
 260           break;
 261         case '\n':
 262           break;
 263         case EOF:
 264           error(getPos(), "Unterminated string");
 265           done = gTrue;
 266           break;
 267         default:
 268           c2 = c;
 269           break;
 270         }
 271         break;
 272
 273       default:
 274         c2 = c;
 275         break;
 276       }
 277
 278       if (c2 != EOF) {
 279         if (n == tokBufSize) {
 280           if (!s)
 281             s = new GString(tokBuf, tokBufSize);
 282           else
 283             s->append(tokBuf, tokBufSize);
 284           p = tokBuf;
 285           n = 0;
 286         }
 287         *p++ = (char)c2;
 288         ++n;
 289       }
 290     } while (!done);
 291     if (!s)
 292       s = new GString(tokBuf, n);
 293     else
 294       s->append(tokBuf, n);
 295     obj->initString(s);
 296     break;
 297
 298   // name
 299   case '/':
 300     p = tokBuf;
 301     n = 0;
 302     while ((c = lookChar()) != EOF && !specialChars[c]) {
 303       getChar();
 304       if (c == '#') {
 305         c2 = lookChar();
 306         if (c2 >= '0' && c2 <= '9') {
 307           c = c2 - '0';
 308         } else if (c2 >= 'A' && c2 <= 'F') {
 309           c = c2 - 'A' + 10;
 310         } else if (c2 >= 'a' && c2 <= 'f') {
 311           c = c2 - 'a' + 10;
 312         } else {
 313           goto notEscChar;
 314         }
 315         getChar();
 316         c <<= 4;
 317         c2 = getChar();
 318         if (c2 >= '0' && c2 <= '9') {
 319           c += c2 - '0';
 320         } else if (c2 >= 'A' && c2 <= 'F') {
 321           c += c2 - 'A' + 10;
 322         } else if (c2 >= 'a' && c2 <= 'f') {
 323           c += c2 - 'a' + 10;
 324         } else {
 325           error(getPos(), "Illegal digit in hex char in name");
 326         }
 327       }
 328      notEscChar:
 329       if (++n == tokBufSize) {
 330         error(getPos(), "Name token too long");
 331         break;
 332       }
 333       *p++ = c;
 334     }
 335     *p = '\0';
 336     obj->initName(tokBuf);
 337     break;
 338
 339   // array punctuation
 340   case '[':
 341   case ']':
 342     tokBuf[0] = c;
 343     tokBuf[1] = '\0';
 344     obj->initCmd(tokBuf);
 345     break;
 346
 347   // hex string or dict punctuation
 348   case '<':
 349     c = lookChar();
 350
 351     // dict punctuation
 352     if (c == '<') {
 353       getChar();
 354       tokBuf[0] = tokBuf[1] = '<';
 355       tokBuf[2] = '\0';
 356       obj->initCmd(tokBuf);
 357
 358     // hex string
 359     } else {
 360       p = tokBuf;
 361       m = n = 0;
 362       c2 = 0;
 363       s = NULL;
 364       while (1) {
 365         c = getChar();
 366         if (c == '>') {
 367           break;
 368         } else if (c == EOF) {
 369           error(getPos(), "Unterminated hex string");
 370           break;
 371         } else if (specialChars[c] != 1) {
 372           c2 = c2 << 4;
 373           if (c >= '0' && c <= '9')
 374             c2 += c - '0';
 375           else if (c >= 'A' && c <= 'F')
 376             c2 += c - 'A' + 10;
 377           else if (c >= 'a' && c <= 'f')
 378             c2 += c - 'a' + 10;
 379           else
 380             error(getPos(), "Illegal character <%02x> in hex string", c);
 381           if (++m == 2) {
 382             if (n == tokBufSize) {
 383               if (!s)
 384                 s = new GString(tokBuf, tokBufSize);
 385               else
 386                 s->append(tokBuf, tokBufSize);
 387               p = tokBuf;
 388               n = 0;
 389             }
 390             *p++ = (char)c2;
 391             ++n;
 392             c2 = 0;
 393             m = 0;
 394           }
 395         }
 396       }
 397       if (!s)
 398         s = new GString(tokBuf, n);
 399       else
 400         s->append(tokBuf, n);
 401       if (m == 1)
 402         s->append((char)(c2 << 4));
 403       obj->initString(s);
 404     }
 405     break;
 406
 407   // dict punctuation
 408   case '>':
 409     c = lookChar();
 410     if (c == '>') {
 411       getChar();
 412       tokBuf[0] = tokBuf[1] = '>';
 413       tokBuf[2] = '\0';
 414       obj->initCmd(tokBuf);
 415     } else {
 416       error(getPos(), "Illegal character '>'");
 417       obj->initError();
 418     }
 419     break;
 420
 421   // error
 422   case ')':
 423   case '{':
 424   case '}':
 425     error(getPos(), "Illegal character '%c'", c);
 426     obj->initError();
 427     break;
 428
 429   // command
 430   default:
 431     p = tokBuf;
 432     *p++ = c;
 433     n = 1;
 434     while ((c = lookChar()) != EOF && !specialChars[c]) {
 435       getChar();
 436       if (++n == tokBufSize) {
 437         error(getPos(), "Command token too long");
 438         break;
 439       }
 440       *p++ = c;
 441     }
 442     *p = '\0';
 443     if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
 444       obj->initBool(gTrue);
 445     } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
 446       obj->initBool(gFalse);
 447     } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
 448       obj->initNull();
 449     } else {
 450       obj->initCmd(tokBuf);
 451     }
 452     break;
 453   }
 454
 455   return obj;
 456 }
 457
 458 void Lexer::skipToNextLine() {
 459   int c;
 460
 461   while (1) {
 462     c = getChar();
 463     if (c == EOF || c == '\n') {
 464       return;
 465     }
 466     if (c == '\r') {
 467       if ((c = lookChar()) == '\n') {
 468         getChar();
 469       }
 470       return;
 471     }
 472   }
 473 }