fixed utf8 handling

author Matthias Kramm <kramm@quiss.org>

Sun, 13 Dec 2009 03:03:19 +0000 (19:03 -0800)

committer Matthias Kramm <kramm@quiss.org>

Sun, 13 Dec 2009 03:03:19 +0000 (19:03 -0800)
author Matthias Kramm <kramm@quiss.org>
Sun, 13 Dec 2009 03:03:19 +0000 (19:03 -0800)
committer Matthias Kramm <kramm@quiss.org>
Sun, 13 Dec 2009 03:03:19 +0000 (19:03 -0800)
diff --git a/lib/as3/Makefile b/lib/as3/Makefile

index be18901..b3ecd23 100644 (file)
--- a/lib/as3/Makefile
+++ b/lib/as3/Makefile
@@ -5,17 +5,23 @@ tests: testwrite testrewrite testpaths testreadwrite
  
  D=-g -pg
  
  
  D=-g -pg
  
+#BISONDEBUG=yes
+
  MODULES = abc.o opcodes.o code.o pool.o scripts.o expr.o common.o initcode.o
  SOURCES = abc.c abc.h pool.c pool.h files.c files.h code.c code.h registry.c registry.h opcodes.c opcodes.h builtin.c builtin.h compiler.c compiler.h parser.tab.h parser.tab.c tokenizer.yy.c scripts.c import.c import.h expr.c expr.h common.c common.h initcode.c initcode.h
  
  tokenizer.yy.c: tokenizer.lex tokenizer.h
         flex -Pas3_ -8 -B -otokenizer.yy.c tokenizer.lex
  
  MODULES = abc.o opcodes.o code.o pool.o scripts.o expr.o common.o initcode.o
  SOURCES = abc.c abc.h pool.c pool.h files.c files.h code.c code.h registry.c registry.h opcodes.c opcodes.h builtin.c builtin.h compiler.c compiler.h parser.tab.h parser.tab.c tokenizer.yy.c scripts.c import.c import.h expr.c expr.h common.c common.h initcode.c initcode.h
  
  tokenizer.yy.c: tokenizer.lex tokenizer.h
         flex -Pas3_ -8 -B -otokenizer.yy.c tokenizer.lex
  
+ifeq "$(BISONDEBUG)" "yes"
+BISONDEBUGFLAGS=-t
+BISONDEBUGDEFINE=-DBISONDEBUG
+endif
  parser.tab.h parser.tab.c: parser.y parser.h skeleton.m4 Makefile
  parser.tab.h parser.tab.c: parser.y parser.h skeleton.m4 Makefile
-       bison -S ./skeleton.m4 -v --defines -pa3_ parser.y -o parser.tab.c
+       bison $(BISONDEBUGFLAGS) -S ./skeleton.m4 -v --defines -pa3_ parser.y -o parser.tab.c
  
  main.o: main.c parser.tab.h parser.h
  
  main.o: main.c parser.tab.h parser.h
-       $(C) main.c -o main.o
+       $(C) $(BISONDEBUGDEFINE) main.c -o main.o
  
  mklib.o: mklib.c parser.tab.h parser.h
         $(C) mklib.c -o mklib.o
  
  mklib.o: mklib.c parser.tab.h parser.h
         $(C) mklib.c -o mklib.o
diff --git a/lib/as3/main.c b/lib/as3/main.c

index 2835556..85b36a6 100644 (file)
--- a/lib/as3/main.c
+++ b/lib/as3/main.c
@@ -64,7 +64,9 @@ int main(int argn, char*argv[])
      }
      filename=argv[argn-1];
  
      }
      filename=argv[argn-1];
  
-    //a3_debug = 1; //if bison was called with -t
+#ifdef BISONDEBUG
+    a3_debug = 1; //if bison was called with -t
+#endif
     
      as3_add_include_dir(getcwd(buf, 512));
  
     
      as3_add_include_dir(getcwd(buf, 512));
  
diff --git a/lib/as3/tokenizer.lex b/lib/as3/tokenizer.lex

index 4e0495d..05fd407 100644 (file)
--- a/lib/as3/tokenizer.lex
+++ b/lib/as3/tokenizer.lex
@@ -481,6 +481,7 @@ void tokenizer_unregister_namespace(const char*id)
  }*/
  static inline char tokenizer_is_namespace(const char*id)
  {
  }*/
  static inline char tokenizer_is_namespace(const char*id)
  {
+    if(!active_namespaces) return 0;
      return trie_contains(active_namespaces, (const unsigned char*)id);
  }
  
      return trie_contains(active_namespaces, (const unsigned char*)id);
  }
  
@@ -514,8 +515,20 @@ static int tokenerror();
  %x XMLTEXT
  %x XML
  
  %x XMLTEXT
  %x XML
  
-NAME    [a-zA-Z_\x80-\xff][a-zA-Z0-9_\\\x80-\xff]*
-_        [^a-zA-Z0-9_\\\x80-\xff]
+X1 parsing identifiers with a non unicode lexer is a nightmare we have to skip all possible
+X2 combinations of byte order markers or utf8 space chars and i dont quite like the fact that
+X3 lex doesnt support proper comments in this section either...
+X4 {NAME_HEAD}{NAME_TAIL} 
+
+NAME_NOC2EF  [a-zA-Z_\x80-\xc1\xc3-\xee\xf0-\xff]
+NAME_EF      [\xef][a-zA-Z0-9_\\\x80-\xba\xbc-\xff]
+NAME_C2      [\xc2][a-zA-Z0-9_\\\x80-\x9f\xa1-\xff]
+NAME_EFBB    [\xef][\xbb][a-zA-Z0-9_\\\x80-\xbe\xc0-\xff]
+NAME_TAIL    [a-zA-Z_0-9\\\x80-\xff]*
+NAME_HEAD    (({NAME_NOC2EF})|({NAME_EF})|({NAME_C2})|({NAME_EFBB}))
+NAME        {NAME_HEAD}{NAME_TAIL} 
+
+_            [^a-zA-Z0-9_\\\x80-\xff]
  
  HEXINT    0x[a-zA-Z0-9]+
  HEXFLOAT  0x[a-zA-Z0-9]*\.[a-zA-Z0-9]*
  
  HEXINT    0x[a-zA-Z0-9]+
  HEXFLOAT  0x[a-zA-Z0-9]*\.[a-zA-Z0-9]*
@@ -534,7 +547,7 @@ XMLID       [A-Za-z0-9_\x80-\xff]+([:][A-Za-z0-9_\x80-\xff]+)?
  XMLSTRING   ["][^"]*["]
  
  STRING   ["](\\[\x00-\xff]|[^\\"\n])*["]|['](\\[\x00-\xff]|[^\\'\n])*[']
  XMLSTRING   ["][^"]*["]
  
  STRING   ["](\\[\x00-\xff]|[^\\"\n])*["]|['](\\[\x00-\xff]|[^\\'\n])*[']
-S       ([ \n\r\t\xa0]|\xc2\xa0)
+S       ([ \n\r\t\xa0]|[\xc2][\xa0])
  MULTILINE_COMMENT [/][*]+([*][^/]|[^/*]|[^*][/]|[\x00-\x1f])*[*]+[/]
  SINGLELINE_COMMENT \/\/[^\n\r]*[\n\r]
  REGEXP   [/]([^/\n]|\\[/])*[/][a-zA-Z]*
  MULTILINE_COMMENT [/][*]+([*][^/]|[^/*]|[^*][/]|[\x00-\x1f])*[*]+[/]
  SINGLELINE_COMMENT \/\/[^\n\r]*[\n\r]
  REGEXP   [/]([^/\n]|\\[/])*[/][a-zA-Z]*
@@ -587,7 +600,7 @@ REGEXP   [/]([^/\n]|\\[/])*[/][a-zA-Z]*
  <REGEXPOK>[\{]               {c(); BEGIN(REGEXPOK);return m(T_DICTSTART);}
  [\{]                         {c(); BEGIN(DEFAULT); return m('{');}
  
  <REGEXPOK>[\{]               {c(); BEGIN(REGEXPOK);return m(T_DICTSTART);}
  [\{]                         {c(); BEGIN(DEFAULT); return m('{');}
  
-\xef\xbb\xbf                 {/* utf 8 bom */}
+\xef\xbb\xbf                 {/* utf 8 bom (0xfeff) */}
  {S}                          {l();}
  
  {HEXINT}/{_}                 {c(); BEGIN(DEFAULT);return handlehex();}
  {S}                          {l();}
  
  {HEXINT}/{_}                 {c(); BEGIN(DEFAULT);return handlehex();}
author	Matthias Kramm <kramm@quiss.org>
	Sun, 13 Dec 2009 03:03:19 +0000 (19:03 -0800)
committer	Matthias Kramm <kramm@quiss.org>
	Sun, 13 Dec 2009 03:03:19 +0000 (19:03 -0800)
lib/as3/Makefile		patch \| blob \| history
lib/as3/main.c		patch \| blob \| history
lib/as3/tokenizer.lex		patch \| blob \| history