1 /***********************************************************************
3 A JavaScript tokenizer / parser / beautifier / compressor.
5 This version is suitable for Node.js. With minimal changes (the
6 exports stuff) it should work on any JS platform.
8 This file contains the tokenizer/parser. It is a port to JavaScript
9 of parse-js [1], a JavaScript parser library written in Common Lisp
10 by Marijn Haverbeke. Thank you Marijn!
12 [1] http://marijn.haverbeke.nl/parse-js/
16 - tokenizer(code) -- returns a function. Call the returned
17 function to fetch the next token.
19 - parse(code) -- returns an AST of the given JavaScript code.
21 -------------------------------- (C) ---------------------------------
24 <mihai.bazon@gmail.com>
25 http://mihai.bazon.net/blog
27 Distributed under the BSD license:
29 Copyright 2010 (c) Mihai Bazon <mihai.bazon@gmail.com>
30 Based on parse-js (http://marijn.haverbeke.nl/parse-js/).
32 Redistribution and use in source and binary forms, with or without
33 modification, are permitted provided that the following conditions
36 * Redistributions of source code must retain the above
37 copyright notice, this list of conditions and the following
40 * Redistributions in binary form must reproduce the above
41 copyright notice, this list of conditions and the following
42 disclaimer in the documentation and/or other materials
43 provided with the distribution.
45 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER “AS IS” AND ANY
46 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
48 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE
49 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
50 OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
51 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
52 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
53 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
54 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
55 THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 ***********************************************************************/
60 /* -----[ Tokenizer (constants) ]----- */
62 var KEYWORDS = array_to_hash([
90 var RESERVED_WORDS = array_to_hash([
123 var KEYWORDS_BEFORE_EXPRESSION = array_to_hash([
132 var KEYWORDS_ATOM = array_to_hash([
// Characters that make next_token() hand control to read_operator().
// "/" is deliberately absent: next_token() routes it to handle_slash(),
// which decides between comment, regexp and the division operator.
var OPERATOR_CHARS = array_to_hash(characters("+-*&%=<>!?|~^"));
// Numeric literal shapes tried, in this order, by parse_js_number().
// None of them carries the /g flag, so .test() is stateless.

// "0x" / "0X" followed by at least one hexadecimal digit.
var RE_HEX_NUMBER = /^0x[0-9a-f]+$/i;

// A leading zero followed by at least one octal digit (legacy octal).
var RE_OCT_NUMBER = /^0[0-7]+$/;

// Decimal digits with an optional fraction and an optional exponent.
var RE_DEC_NUMBER = /^\d*\.?\d*(?:e[+-]?\d*(?:\d\.?|\.?\d)\d*)?$/i;
145 var OPERATORS = array_to_hash([
// Characters skipped by skip_whitespace().  Besides space, newline, carriage
// return and tab this also covers NBSP (U+00A0), form feed and vertical tab,
// which ECMAScript classifies as white space as well.
var WHITESPACE_CHARS = array_to_hash(characters(" \u00a0\n\r\t\f\u000b"));

// Punctuation after which token() sets regex_allowed, so that a following
// "/" is read as the start of a regexp rather than as an operator.
var PUNC_BEFORE_EXPRESSION = array_to_hash(characters("[{}(,.;:"));

// Single characters that next_token() emits directly as "punc" tokens.
// "." is not listed here; next_token() routes it through handle_dot().
var PUNC_CHARS = array_to_hash(characters("[]{}(),;:"));

// Flag letters read_regexp() accepts after the closing "/" of a regexp.
var REGEXP_MODIFIERS = array_to_hash(characters("gmsiy"));
202 /* -----[ Tokenizer ]----- */
/**
 * Tell whether the first character of `ch` is an ASCII digit or letter.
 *
 * @param {string} ch - String whose first UTF-16 code unit is inspected.
 * @returns {boolean} true for 0-9, A-Z and a-z; false for anything else,
 *          including the empty string (its char code is NaN, which fails
 *          every range comparison).
 */
function is_alphanumeric_char(ch) {
    var code = ch.charCodeAt(0);
    return (code >= 48 && code <= 57) ||   // "0".."9"
           (code >= 65 && code <= 90) ||   // "A".."Z"
           (code >= 97 && code <= 122);    // "a".."z"
}
/**
 * Tell whether `ch` may appear in an identifier as read by read_word():
 * an ASCII letter or digit, "$" or "_".
 *
 * @param {string} ch - Character to classify.
 * @returns {boolean} true when `ch` is an identifier character.
 */
function is_identifier_char(ch) {
    return is_alphanumeric_char(ch) || ch == "$" || ch == "_";
}
/**
 * Tell whether the first character of `ch` is an ASCII decimal digit.
 *
 * @param {string} ch - String whose first UTF-16 code unit is inspected.
 * @returns {boolean} true for "0".."9"; false otherwise, including for the
 *          empty string.
 */
function is_digit(ch) {
    var code = ch.charCodeAt(0);
    return code >= 48 && code <= 57;
}
/**
 * Convert the source text of a numeric literal into its value.
 *
 * The text is matched against RE_HEX_NUMBER, RE_OCT_NUMBER and
 * RE_DEC_NUMBER, in that order; the first match decides the conversion.
 *
 * @param {string} num - Literal text, e.g. "0xff", "017" or "1.5e3".
 * @returns {number|undefined} The numeric value, or undefined when the text
 *          matches none of the three patterns.
 */
function parse_js_number(num) {
    if (RE_HEX_NUMBER.test(num)) {
        // Drop the "0x" prefix; the radix is passed explicitly.
        return parseInt(num.slice(2), 16);
    }
    if (RE_OCT_NUMBER.test(num)) {
        // Drop the leading "0" and read the rest as base 8.
        return parseInt(num.slice(1), 8);
    }
    if (RE_DEC_NUMBER.test(num)) {
        return parseFloat(num);
    }
    // No pattern matched: fall through and return undefined.
}
230 function JS_Parse_Error(message, line, col, pos) {
231 this.message = message;
238 this.stack = ex.stack;
/**
 * Render the error as "<message> (line: L, col: C, pos: P)", followed by a
 * blank line and the value of this.stack.
 *
 * @returns {string} Human-readable description of the parse error.
 */
JS_Parse_Error.prototype.toString = function() {
    return this.message + " (line: " + this.line + ", col: " + this.col + ", pos: " + this.pos + ")" + "\n\n" + this.stack;
};
/**
 * Throw a JS_Parse_Error built from the given message and location.
 * This function never returns normally.
 *
 * @param {string} message - Description of the problem.
 * @param {number} line - Line passed through to JS_Parse_Error.
 * @param {number} col - Column passed through to JS_Parse_Error.
 * @param {number} pos - Position passed through to JS_Parse_Error.
 * @throws {JS_Parse_Error} Always.
 */
function js_error(message, line, col, pos) {
    throw new JS_Parse_Error(message, line, col, pos);
}
/**
 * Check a token against a type and, optionally, a value.
 *
 * @param {{type: *, value: *}} token - Token to inspect.
 * @param {string} type - Required token type.
 * @param {*} [val] - Required token value; when null or undefined only the
 *        type is compared.
 * @returns {boolean} true when the token matches.
 */
function is_token(token, type, val) {
    // Loose equality is kept on purpose: `val == null` covers both null and
    // undefined, i.e. "no value constraint".
    return token.type == type && (val == null || token.value == val);
}
256 function tokenizer($TEXT, skip_comments) {
259 text : $TEXT.replace(/\r\n?|[\n\u2028\u2029]/g, "\n").replace(/^\uFEFF/, ''),
266 newline_before : false,
267 regex_allowed : false
270 function peek() { return S.text.charAt(S.pos); };
272 function next(signal_eof) {
273 var ch = S.text.charAt(S.pos++);
274 if (signal_eof && !ch)
277 S.newline_before = true;
290 function find(what, signal_eof) {
291 var pos = S.text.indexOf(what, S.pos);
292 if (signal_eof && pos == -1) throw EX_EOF;
296 function start_token() {
302 function token(type, value) {
303 S.regex_allowed = ((type == "operator" && !HOP(UNARY_POSTFIX, value)) ||
304 (type == "keyword" && HOP(KEYWORDS_BEFORE_EXPRESSION, value)) ||
305 (type == "punc" && HOP(PUNC_BEFORE_EXPRESSION, value)));
312 nlb : S.newline_before
314 S.newline_before = false;
318 function skip_whitespace() {
319 while (HOP(WHITESPACE_CHARS, peek()))
323 function read_while(pred) {
324 var ret = "", ch = peek(), i = 0;
325 while (ch && pred(ch, i++)) {
332 function parse_error(err) {
333 js_error(err, S.tokline, S.tokcol, S.tokpos);
336 function read_num(prefix) {
337 var has_e = false, after_e = false, has_x = false, has_dot = prefix == ".";
338 var num = read_while(function(ch, i){
339 if (ch == "x" || ch == "X") {
340 if (has_x) return false;
343 if (!has_x && (ch == "E" || ch == "e")) {
344 if (has_e) return false;
345 return has_e = after_e = true;
348 if (after_e || (i == 0 && !prefix)) return true;
351 if (ch == "+") return after_e;
355 return has_dot = true;
358 return is_alphanumeric_char(ch);
362 var valid = parse_js_number(num);
364 return token("num", valid);
366 parse_error("Invalid syntax: " + num);
370 function read_escaped_char() {
373 case "n" : return "\n";
374 case "r" : return "\r";
375 case "t" : return "\t";
376 case "b" : return "\b";
377 case "v" : return "\v";
378 case "f" : return "\f";
379 case "0" : return "\0";
380 case "x" : return String.fromCharCode(hex_bytes(2));
381 case "u" : return String.fromCharCode(hex_bytes(4));
386 function hex_bytes(n) {
389 var digit = parseInt(next(true), 16);
391 parse_error("Invalid hex-character pattern in string");
392 num = (num << 4) | digit;
397 function read_string() {
398 return with_eof_error("Unterminated string constant", function(){
399 var quote = next(), ret = "";
402 if (ch == "\\") ch = read_escaped_char();
403 else if (ch == quote) break;
406 return token("string", ret);
410 function read_line_comment() {
412 var i = find("\n"), ret;
414 ret = S.text.substr(S.pos);
415 S.pos = S.text.length;
417 ret = S.text.substring(S.pos, i);
420 return token("comment1", ret);
423 function read_multiline_comment() {
425 return with_eof_error("Unterminated multiline comment", function(){
426 var i = find("*/", true),
427 text = S.text.substring(S.pos, i),
428 tok = token("comment2", text);
430 S.line += text.split("\n").length - 1;
431 S.newline_before = text.indexOf("\n") >= 0;
436 function read_regexp() {
437 return with_eof_error("Unterminated regular expression", function(){
438 var prev_backslash = false, regexp = "", ch, in_class = false;
439 while ((ch = next(true))) if (prev_backslash) {
441 prev_backslash = false;
442 } else if (ch == "[") {
445 } else if (ch == "]" && in_class) {
448 } else if (ch == "/" && !in_class) {
450 } else if (ch == "\\") {
451 prev_backslash = true;
455 var mods = read_while(function(ch){
456 return HOP(REGEXP_MODIFIERS, ch);
458 return token("regexp", [ regexp, mods ]);
462 function read_operator(prefix) {
464 var bigger = op + peek();
465 if (HOP(OPERATORS, bigger)) {
472 return token("operator", grow(prefix || next()));
475 var handle_slash = skip_comments ? function() {
477 var regex_allowed = S.regex_allowed;
479 case "/": read_line_comment(); S.regex_allowed = regex_allowed; return next_token();
480 case "*": read_multiline_comment(); S.regex_allowed = regex_allowed; return next_token();
482 return S.regex_allowed ? read_regexp() : read_operator("/");
486 case "/": return read_line_comment();
487 case "*": return read_multiline_comment();
489 return S.regex_allowed ? read_regexp() : read_operator("/");
492 function handle_dot() {
494 return is_digit(peek())
496 : token("punc", ".");
499 function read_word() {
500 var word = read_while(is_identifier_char);
501 return !HOP(KEYWORDS, word)
502 ? token("name", word)
503 : HOP(OPERATORS, word)
504 ? token("operator", word)
505 : HOP(KEYWORDS_ATOM, word)
506 ? token("atom", word)
507 : token("keyword", word);
510 function with_eof_error(eof_error, cont) {
514 if (ex === EX_EOF) parse_error(eof_error);
519 function next_token(force_regexp) {
521 return read_regexp();
525 if (!ch) return token("eof");
526 if (is_digit(ch)) return read_num();
527 if (ch == '"' || ch == "'") return read_string();
528 if (HOP(PUNC_CHARS, ch)) return token("punc", next());
529 if (ch == ".") return handle_dot();
530 if (ch == "/") return handle_slash();
531 if (HOP(OPERATOR_CHARS, ch)) return read_operator();
532 if (is_identifier_char(ch)) return read_word();
533 parse_error("Unexpected character '" + ch + "'");
536 next_token.context = function(nc) {
545 /* -----[ Parser (constants) ]----- */
547 var UNARY_PREFIX = array_to_hash([
// Operators that may follow their operand.  The tokenizer's token() consults
// this set when deciding whether a regexp may follow an operator, and the
// parser's subscripts() uses it to recognise postfix "++" / "--".
var UNARY_POSTFIX = array_to_hash([ "--", "++" ]);
561 var ASSIGNMENT = (function(a, ret, i){
562 while (i < a.length) {
563 ret[a[i]] = a[i].substr(0, a[i].length - 1);
568 ["+=", "-=", "/=", "*=", "%=", ">>=", "<<=", ">>>=", "~=", "%=", "|=", "^=", "&="],
573 var PRECEDENCE = (function(a, ret){
574 for (var i = 0, n = 1; i < a.length; ++i, ++n) {
576 for (var j = 0; j < b.length; ++j) {
588 ["==", "===", "!=", "!=="],
589 ["<", ">", "<=", ">=", "in", "instanceof"],
// Statement kinds that labeled_statement() checks a labelled statement
// against when strict_mode is on.
var STATEMENTS_WITH_LABELS = array_to_hash([ "for", "do", "while", "switch" ]);

// Token types that expr_atom() turns directly into an atom node.
var ATOMIC_START_TOKEN = array_to_hash([ "atom", "num", "string", "regexp", "name" ]);
601 /* -----[ Parser ]----- */
603 function NodeWithToken(str, start, end) {
609 NodeWithToken.prototype.toString = function() { return this.name; };
611 function parse($TEXT, strict_mode, embed_tokens) {
614 input: tokenizer($TEXT, true),
625 function is(type, value) {
626 return is_token(S.token, type, value);
629 function peek() { return S.peeked || (S.peeked = S.input()); };
646 function croak(msg, line, col, pos) {
647 var ctx = S.input.context();
649 line != null ? line : ctx.tokline,
650 col != null ? col : ctx.tokcol,
651 pos != null ? pos : ctx.tokpos);
654 function token_error(token, msg) {
655 croak(msg, token.line, token.col);
658 function unexpected(token) {
661 token_error(token, "Unexpected token: " + token.type + " (" + token.value + ")");
664 function expect_token(type, val) {
668 token_error(S.token, "Unexpected token " + S.token.type + ", expected " + type);
671 function expect(punc) { return expect_token("punc", punc); };
673 function can_insert_semicolon() {
674 return !strict_mode && (
675 S.token.nlb || is("eof") || is("punc", "}")
679 function semicolon() {
680 if (is("punc", ";")) next();
681 else if (!can_insert_semicolon()) unexpected();
685 return slice(arguments);
688 function parenthesised() {
690 var ex = expression();
695 function add_tokens(str, start, end) {
696 return new NodeWithToken(str, start, end);
699 var statement = embed_tokens ? function() {
701 var stmt = $statement();
702 stmt[0] = add_tokens(stmt[0], start, prev());
706 function $statement() {
707 if (is("operator", "/")) {
709 S.token = S.input(true); // force regexp
711 switch (S.token.type) {
717 return simple_statement();
720 return is_token(peek(), "punc", ":")
721 ? labeled_statement(prog1(S.token.value, next, next))
722 : simple_statement();
725 switch (S.token.value) {
727 return as("block", block_());
730 return simple_statement();
739 switch (prog1(S.token.value, next)) {
741 return break_cont("break");
744 return break_cont("continue");
748 return as("debugger");
751 return (function(body){
752 expect_token("keyword", "while");
753 return as("do", prog1(parenthesised, semicolon), body);
754 })(in_loop(statement));
760 return function_(true);
766 if (S.in_function == 0)
767 croak("'return' outside of function");
771 : can_insert_semicolon()
773 : prog1(expression, semicolon));
776 return as("switch", parenthesised(), switch_block_());
779 return as("throw", prog1(expression, semicolon));
785 return prog1(var_, semicolon);
788 return prog1(const_, semicolon);
791 return as("while", parenthesised(), in_loop(statement));
794 return as("with", parenthesised(), statement());
802 function labeled_statement(label) {
803 S.labels.push(label);
804 var start = S.token, stat = statement();
805 if (strict_mode && !HOP(STATEMENTS_WITH_LABELS, stat[0]))
808 return as("label", label, stat);
811 function simple_statement() {
812 return as("stat", prog1(expression, semicolon));
815 function break_cont(type) {
816 var name = is("name") ? S.token.value : null;
819 if (!member(name, S.labels))
820 croak("Label " + name + " without matching loop or statement");
822 else if (S.in_loop == 0)
823 croak(type + " not inside a loop or switch");
825 return as(type, name);
830 var has_var = is("keyword", "var");
833 if (is("name") && is_token(peek(), "operator", "in")) {
835 var name = S.token.value;
837 var obj = expression();
839 return as("for-in", has_var, name, obj, in_loop(statement));
842 var init = is("punc", ";") ? null : has_var ? var_() : expression();
844 var test = is("punc", ";") ? null : expression();
846 var step = is("punc", ")") ? null : expression();
848 return as("for", init, test, step, in_loop(statement));
852 function function_(in_statement) {
853 var name = is("name") ? prog1(S.token.value, next) : null;
854 if (in_statement && !name)
857 return as(in_statement ? "defun" : "function",
861 while (!is("punc", ")")) {
862 if (first) first = false; else expect(",");
863 if (!is("name")) unexpected();
864 a.push(S.token.value);
873 var loop = S.in_loop;
883 var cond = parenthesised(), body = statement(), belse;
884 if (is("keyword", "else")) {
888 return as("if", cond, body, belse);
894 while (!is("punc", "}")) {
895 if (is("eof")) unexpected();
902 var switch_block_ = curry(in_loop, function(){
904 var a = [], cur = null;
905 while (!is("punc", "}")) {
906 if (is("eof")) unexpected();
907 if (is("keyword", "case")) {
910 a.push([ expression(), cur ]);
913 else if (is("keyword", "default")) {
917 a.push([ null, cur ]);
920 if (!cur) unexpected();
921 cur.push(statement());
929 var body = block_(), bcatch, bfinally;
930 if (is("keyword", "catch")) {
934 croak("Name expected");
935 var name = S.token.value;
938 bcatch = [ name, block_() ];
940 if (is("keyword", "finally")) {
944 if (!bcatch && !bfinally)
945 croak("Missing catch/finally blocks");
946 return as("try", body, bcatch, bfinally);
954 var name = S.token.value;
956 if (is("operator", "=")) {
958 a.push([ name, expression(false) ]);
962 if (!is("punc", ","))
970 return as("var", vardefs());
974 return as("const", vardefs());
978 var newexp = expr_atom(false), args;
979 if (is("punc", "(")) {
981 args = expr_list(")");
985 return subscripts(as("new", newexp, args), true);
988 function expr_atom(allow_calls) {
989 if (is("operator", "new")) {
993 if (is("operator") && HOP(UNARY_PREFIX, S.token.value)) {
994 return make_unary("unary-prefix",
995 prog1(S.token.value, next),
996 expr_atom(allow_calls));
999 switch (S.token.value) {
1002 return subscripts(prog1(expression, curry(expect, ")")), allow_calls);
1005 return subscripts(array_(), allow_calls);
1008 return subscripts(object_(), allow_calls);
1012 if (is("keyword", "function")) {
1014 return subscripts(function_(false), allow_calls);
1016 if (HOP(ATOMIC_START_TOKEN, S.token.type)) {
1017 var atom = S.token.type == "regexp"
1018 ? as("regexp", S.token.value[0], S.token.value[1])
1019 : as(S.token.type, S.token.value);
1020 return subscripts(prog1(atom, next), allow_calls);
1025 function expr_list(closing, allow_trailing_comma) {
1026 var first = true, a = [];
1027 while (!is("punc", closing)) {
1028 if (first) first = false; else expect(",");
1029 if (allow_trailing_comma && is("punc", closing))
1031 a.push(expression(false));
1038 return as("array", expr_list("]", !strict_mode));
1041 function object_() {
1042 var first = true, a = [];
1043 while (!is("punc", "}")) {
1044 if (first) first = false; else expect(",");
1045 if (!strict_mode && is("punc", "}"))
1046 // allow trailing comma
1048 var type = S.token.type;
1049 var name = as_property_name();
1050 if (type == "name" && (name == "get" || name == "set") && !is("punc", ":")) {
1051 a.push([ as_name(), function_(false), name ]);
1054 a.push([ name, expression(false) ]);
1058 return as("object", a);
1061 function as_property_name() {
1062 switch (S.token.type) {
1065 return prog1(S.token.value, next);
1070 function as_name() {
1071 switch (S.token.type) {
1076 return prog1(S.token.value, next);
1082 function subscripts(expr, allow_calls) {
1083 if (is("punc", ".")) {
1085 return subscripts(as("dot", expr, as_name()), allow_calls);
1087 if (is("punc", "[")) {
1089 return subscripts(as("sub", expr, prog1(expression, curry(expect, "]"))), allow_calls);
1091 if (allow_calls && is("punc", "(")) {
1093 return subscripts(as("call", expr, expr_list(")")), true);
1095 if (allow_calls && is("operator") && HOP(UNARY_POSTFIX, S.token.value)) {
1096 return prog1(curry(make_unary, "unary-postfix", S.token.value, expr),
1102 function make_unary(tag, op, expr) {
1103 if ((op == "++" || op == "--") && !is_assignable(expr))
1104 croak("Invalid use of " + op + " operator");
1105 return as(tag, op, expr);
1108 function expr_op(left, min_prec) {
1109 var op = is("operator") ? S.token.value : null;
1110 var prec = op != null ? PRECEDENCE[op] : null;
1111 if (prec != null && prec > min_prec) {
1113 var right = expr_op(expr_atom(true), prec);
1114 return expr_op(as("binary", op, left, right), min_prec);
1119 function expr_ops() {
1120 return expr_op(expr_atom(true), 0);
1123 function maybe_conditional() {
1124 var expr = expr_ops();
1125 if (is("operator", "?")) {
1127 var yes = expression(false);
1129 return as("conditional", expr, yes, expression(false));
1134 function is_assignable(expr) {
1140 return expr[1] != "this";
1144 function maybe_assign() {
1145 var left = maybe_conditional(), val = S.token.value;
1146 if (is("operator") && HOP(ASSIGNMENT, val)) {
1147 if (is_assignable(left)) {
1149 return as("assign", ASSIGNMENT[val], left, maybe_assign());
1151 croak("Invalid assignment");
1156 function expression(commas) {
1157 if (arguments.length == 0)
1159 var expr = maybe_assign();
1160 if (commas && is("punc", ",")) {
1162 return as("seq", expr, expression());
1167 function in_loop(cont) {
1176 return as("toplevel", (function(a){
1178 a.push(statement());
1184 /* -----[ Utilities ]----- */
1187 var args = slice(arguments, 1);
1188 return function() { return f.apply(this, args.concat(slice(arguments))); };
1191 function prog1(ret) {
1192 if (ret instanceof Function)
1194 for (var i = 1, n = arguments.length; --n > 0; ++i)
1199 function array_to_hash(a) {
1201 for (var i = 0; i < a.length; ++i)
/**
 * Copy an array-like object (for example `arguments`) into a real Array.
 *
 * @param {ArrayLike} a - Source array or array-like object.
 * @param {number} [start] - Index to start copying from; null or undefined
 *        means 0.
 * @returns {Array} A new array holding the elements from `start` onwards.
 */
function slice(a, start) {
    return Array.prototype.slice.call(a, start == null ? 0 : start);
}
/**
 * Split a string into an array of one-character strings (one entry per
 * UTF-16 code unit).
 *
 * @param {string} str - String to split.
 * @returns {string[]} The individual characters; empty for "".
 */
function characters(str) {
    return str.split("");
}
1214 function member(name, array) {
1215 for (var i = array.length; --i >= 0;)
1216 if (array[i] === name)
/**
 * "Has Own Property": test whether `prop` is an own property of `obj`.
 *
 * Object.prototype.hasOwnProperty is invoked through .call, so inherited
 * keys never match and the check does not depend on `obj` supplying a
 * hasOwnProperty method of its own.
 *
 * @param {Object} obj - Object used as a lookup table.
 * @param {string} prop - Key to look for.
 * @returns {boolean} true when `obj` has `prop` as an own property.
 */
function HOP(obj, prop) {
    return Object.prototype.hasOwnProperty.call(obj, prop);
}
1225 /* -----[ Exports ]----- */
// Entry points.
exports.tokenizer = tokenizer;
exports.parse = parse;

// Helper functions shared with other modules.
exports.slice = slice;
exports.curry = curry;
exports.member = member;
exports.array_to_hash = array_to_hash;
exports.is_alphanumeric_char = is_alphanumeric_char;

// Lookup tables describing the language.
exports.PRECEDENCE = PRECEDENCE;
exports.KEYWORDS_ATOM = KEYWORDS_ATOM;
exports.RESERVED_WORDS = RESERVED_WORDS;
exports.KEYWORDS = KEYWORDS;
exports.ATOMIC_START_TOKEN = ATOMIC_START_TOKEN;
exports.OPERATORS = OPERATORS;