From 5f639e1d63bbb39eb0e9cb6d4aca28aff1ac21e3 Mon Sep 17 00:00:00 2001 From: Matthias Kramm Date: Sat, 12 Dec 2009 19:03:19 -0800 Subject: [PATCH] fixed utf8 handling --- lib/as3/Makefile | 10 ++++++++-- lib/as3/main.c | 4 +++- lib/as3/tokenizer.lex | 21 +++++++++++++++++---- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/lib/as3/Makefile b/lib/as3/Makefile index be18901..b3ecd23 100644 --- a/lib/as3/Makefile +++ b/lib/as3/Makefile @@ -5,17 +5,23 @@ tests: testwrite testrewrite testpaths testreadwrite D=-g -pg +#BISONDEBUG=yes + MODULES = abc.o opcodes.o code.o pool.o scripts.o expr.o common.o initcode.o SOURCES = abc.c abc.h pool.c pool.h files.c files.h code.c code.h registry.c registry.h opcodes.c opcodes.h builtin.c builtin.h compiler.c compiler.h parser.tab.h parser.tab.c tokenizer.yy.c scripts.c import.c import.h expr.c expr.h common.c common.h initcode.c initcode.h tokenizer.yy.c: tokenizer.lex tokenizer.h flex -Pas3_ -8 -B -otokenizer.yy.c tokenizer.lex +ifeq "$(BISONDEBUG)" "yes" +BISONDEBUGFLAGS=-t +BISONDEBUGDEFINE=-DBISONDEBUG +endif parser.tab.h parser.tab.c: parser.y parser.h skeleton.m4 Makefile - bison -S ./skeleton.m4 -v --defines -pa3_ parser.y -o parser.tab.c + bison $(BISONDEBUGFLAGS) -S ./skeleton.m4 -v --defines -pa3_ parser.y -o parser.tab.c main.o: main.c parser.tab.h parser.h - $(C) main.c -o main.o + $(C) $(BISONDEBUGDEFINE) main.c -o main.o mklib.o: mklib.c parser.tab.h parser.h $(C) mklib.c -o mklib.o diff --git a/lib/as3/main.c b/lib/as3/main.c index 2835556..85b36a6 100644 --- a/lib/as3/main.c +++ b/lib/as3/main.c @@ -64,7 +64,9 @@ int main(int argn, char*argv[]) } filename=argv[argn-1]; - //a3_debug = 1; //if bison was called with -t +#ifdef BISONDEBUG + a3_debug = 1; //if bison was called with -t +#endif as3_add_include_dir(getcwd(buf, 512)); diff --git a/lib/as3/tokenizer.lex b/lib/as3/tokenizer.lex index 4e0495d..05fd407 100644 --- a/lib/as3/tokenizer.lex +++ b/lib/as3/tokenizer.lex @@ -481,6 +481,7 @@ void 
tokenizer_unregister_namespace(const char*id) }*/ static inline char tokenizer_is_namespace(const char*id) { + if(!active_namespaces) return 0; return trie_contains(active_namespaces, (const unsigned char*)id); } @@ -514,8 +515,20 @@ static int tokenerror(); %x XMLTEXT %x XML -NAME [a-zA-Z_\x80-\xff][a-zA-Z0-9_\\\x80-\xff]* -_ [^a-zA-Z0-9_\\\x80-\xff] +X1 parsing identifiers with a non unicode lexer is a nightmare we have to skip all possible +X2 combinations of byte order markers or utf8 space chars and i dont quite like the fact that +X3 lex doesnt support proper comments in this section either... +X4 {NAME_HEAD}{NAME_TAIL} + +NAME_NOC2EF [a-zA-Z_\x80-\xc1\xc3-\xee\xf0-\xff] +NAME_EF [\xef][a-zA-Z0-9_\\\x80-\xba\xbc-\xff] +NAME_C2 [\xc2][a-zA-Z0-9_\\\x80-\x9f\xa1-\xff] +NAME_EFBB [\xef][\xbb][a-zA-Z0-9_\\\x80-\xbe\xc0-\xff] +NAME_TAIL [a-zA-Z_0-9\\\x80-\xff]* +NAME_HEAD (({NAME_NOC2EF})|({NAME_EF})|({NAME_C2})|({NAME_EFBB})) +NAME {NAME_HEAD}{NAME_TAIL} + +_ [^a-zA-Z0-9_\\\x80-\xff] HEXINT 0x[a-zA-Z0-9]+ HEXFLOAT 0x[a-zA-Z0-9]*\.[a-zA-Z0-9]* @@ -534,7 +547,7 @@ XMLID [A-Za-z0-9_\x80-\xff]+([:][A-Za-z0-9_\x80-\xff]+)? XMLSTRING ["][^"]*["] STRING ["](\\[\x00-\xff]|[^\\"\n])*["]|['](\\[\x00-\xff]|[^\\'\n])*['] -S ([ \n\r\t\xa0]|\xc2\xa0) +S ([ \n\r\t\xa0]|[\xc2][\xa0]) MULTILINE_COMMENT [/][*]+([*][^/]|[^/*]|[^*][/]|[\x00-\x1f])*[*]+[/] SINGLELINE_COMMENT \/\/[^\n\r]*[\n\r] REGEXP [/]([^/\n]|\\[/])*[/][a-zA-Z]* @@ -587,7 +600,7 @@ REGEXP [/]([^/\n]|\\[/])*[/][a-zA-Z]* [\{] {c(); BEGIN(REGEXPOK);return m(T_DICTSTART);} [\{] {c(); BEGIN(DEFAULT); return m('{');} -\xef\xbb\xbf {/* utf 8 bom */} +\xef\xbb\xbf {/* utf 8 bom (0xfeff) */} {S} {l();} {HEXINT}/{_} {c(); BEGIN(DEFAULT);return handlehex();} -- 1.7.10.4