// -*- c-basic-offset: 2 -*- /* * This file is part of the KDE libraries * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) * Copyright (C) 2006 Apple Computer, Inc. * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #include "config.h" #include "lexer.h" #include #include #include "function.h" #include "interpreter.h" #include "nodes.h" #include using namespace WTF; using namespace Unicode; // we can't specify the namespace in yacc's C output, so do it here using namespace KJS; #ifndef KDE_USE_FINAL #include "grammar.h" #endif #include "lookup.h" #include "lexer.lut.h" extern YYLTYPE kjsyylloc; // global bison variable holding token info // a bridge for yacc from the C world to C++ int kjsyylex() { return Lexer::curr()->lex(); } namespace KJS { static Lexer* currLexer = 0; static bool isDecimalDigit(int); Lexer::Lexer() : yylineno(1), size8(128), size16(128), restrKeyword(false), eatNextIdentifier(false), stackToken(-1), lastToken(-1), pos(0), code(0), length(0), #ifndef KJS_PURE_ECMA bol(true), #endif current(0), next1(0), next2(0), next3(0), strings(0), numStrings(0), stringsCapacity(0), identifiers(0), numIdentifiers(0), identifiersCapacity(0) { // allocate space for read buffers buffer8 = new char[size8]; buffer16 = new KJS::UChar[size16]; currLexer = this; } Lexer::~Lexer() { doneParsing(); delete [] buffer8; delete [] buffer16; } Lexer *Lexer::curr() { if (!currLexer) { // create singleton instance currLexer = new Lexer(); } return currLexer; } #ifdef KJS_DEBUG_MEM void Lexer::globalClear() { delete currLexer; currLexer = 0L; } #endif void Lexer::setCode(const UString &sourceURL, int startingLineNumber, const KJS::UChar *c, unsigned int len) { yylineno = 1 + startingLineNumber; m_sourceURL = sourceURL; restrKeyword = false; delimited = false; eatNextIdentifier = false; stackToken = -1; lastToken = -1; pos = 0; code = c; length = len; skipLF = false; skipCR = false; error = false; #ifndef KJS_PURE_ECMA bol = true; #endif // read first characters current = (length > 0) ? code[0].uc : -1; next1 = (length > 1) ? code[1].uc : -1; next2 = (length > 2) ? code[2].uc : -1; next3 = (length > 3) ? code[3].uc : -1; } void Lexer::shift(unsigned int p) { // Here would be a good place to strip Cf characters, but that has caused compatibility problems: // . while (p--) { pos++; current = next1; next1 = next2; next2 = next3; next3 = (pos + 3 < length) ? code[pos + 3].uc : -1; } } // called on each new line void Lexer::nextLine() { yylineno++; #ifndef KJS_PURE_ECMA bol = true; #endif } void Lexer::setDone(State s) { state = s; done = true; } int Lexer::lex() { int token = 0; state = Start; unsigned short stringType = 0; // either single or double quotes pos8 = pos16 = 0; done = false; terminator = false; skipLF = false; skipCR = false; // did we push a token on the stack previously ? // (after an automatic semicolon insertion) if (stackToken >= 0) { setDone(Other); token = stackToken; stackToken = 0; } while (!done) { if (skipLF && current != '\n') // found \r but not \n afterwards skipLF = false; if (skipCR && current != '\r') // found \n but not \r afterwards skipCR = false; if (skipLF || skipCR) // found \r\n or \n\r -> eat the second one { skipLF = false; skipCR = false; shift(1); } switch (state) { case Start: if (isWhiteSpace()) { // do nothing } else if (current == '/' && next1 == '/') { shift(1); state = InSingleLineComment; } else if (current == '/' && next1 == '*') { shift(1); state = InMultiLineComment; } else if (current == -1) { if (!terminator && !delimited) { // automatic semicolon insertion if program incomplete token = ';'; stackToken = 0; setDone(Other); } else setDone(Eof); } else if (isLineTerminator()) { nextLine(); terminator = true; if (restrKeyword) { token = ';'; setDone(Other); } } else if (current == '"' || current == '\'') { state = InString; stringType = static_cast(current); } else if (isIdentStart(current)) { record16(current); state = InIdentifierOrKeyword; } else if (current == '\\') { state = InIdentifierUnicodeEscapeStart; } else if (current == '0') { record8(current); state = InNum0; } else if (isDecimalDigit(current)) { record8(current); state = InNum; } else if (current == '.' && isDecimalDigit(next1)) { record8(current); state = InDecimal; #ifndef KJS_PURE_ECMA // } else if (bol && current == '-' && next1 == '-' && next2 == '>') { shift(2); state = InSingleLineComment; #endif } else { token = matchPunctuator(current, next1, next2, next3); if (token != -1) { setDone(Other); } else { // cerr << "encountered unknown character" << endl; setDone(Bad); } } break; case InString: if (current == stringType) { shift(1); setDone(String); } else if (isLineTerminator() || current == -1) { setDone(Bad); } else if (current == '\\') { state = InEscapeSequence; } else { record16(current); } break; // Escape Sequences inside of strings case InEscapeSequence: if (isOctalDigit(current)) { if (current >= '0' && current <= '3' && isOctalDigit(next1) && isOctalDigit(next2)) { record16(convertOctal(current, next1, next2)); shift(2); state = InString; } else if (isOctalDigit(current) && isOctalDigit(next1)) { record16(convertOctal('0', current, next1)); shift(1); state = InString; } else if (isOctalDigit(current)) { record16(convertOctal('0', '0', current)); state = InString; } else { setDone(Bad); } } else if (current == 'x') state = InHexEscape; else if (current == 'u') state = InUnicodeEscape; else if (isLineTerminator()) { nextLine(); state = InString; } else { record16(singleEscape(static_cast(current))); state = InString; } break; case InHexEscape: if (isHexDigit(current) && isHexDigit(next1)) { state = InString; record16(convertHex(current, next1)); shift(1); } else if (current == stringType) { record16('x'); shift(1); setDone(String); } else { record16('x'); record16(current); state = InString; } break; case InUnicodeEscape: if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) { record16(convertUnicode(current, next1, next2, next3)); shift(3); state = InString; } else if (current == stringType) { record16('u'); shift(1); setDone(String); } else { setDone(Bad); } break; case InSingleLineComment: if (isLineTerminator()) { nextLine(); terminator = true; if (restrKeyword) { token = ';'; setDone(Other); } else state = Start; } else if (current == -1) { setDone(Eof); } break; case InMultiLineComment: if (current == -1) { setDone(Bad); } else if (isLineTerminator()) { nextLine(); } else if (current == '*' && next1 == '/') { state = Start; shift(1); } break; case InIdentifierOrKeyword: case InIdentifier: if (isIdentPart(current)) record16(current); else if (current == '\\') state = InIdentifierUnicodeEscapeStart; else setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier); break; case InNum0: if (current == 'x' || current == 'X') { record8(current); state = InHex; } else if (current == '.') { record8(current); state = InDecimal; } else if (current == 'e' || current == 'E') { record8(current); state = InExponentIndicator; } else if (isOctalDigit(current)) { record8(current); state = InOctal; } else if (isDecimalDigit(current)) { record8(current); state = InDecimal; } else { setDone(Number); } break; case InHex: if (isHexDigit(current)) { record8(current); } else { setDone(Hex); } break; case InOctal: if (isOctalDigit(current)) { record8(current); } else if (isDecimalDigit(current)) { record8(current); state = InDecimal; } else setDone(Octal); break; case InNum: if (isDecimalDigit(current)) { record8(current); } else if (current == '.') { record8(current); state = InDecimal; } else if (current == 'e' || current == 'E') { record8(current); state = InExponentIndicator; } else setDone(Number); break; case InDecimal: if (isDecimalDigit(current)) { record8(current); } else if (current == 'e' || current == 'E') { record8(current); state = InExponentIndicator; } else setDone(Number); break; case InExponentIndicator: if (current == '+' || current == '-') { record8(current); } else if (isDecimalDigit(current)) { record8(current); state = InExponent; } else setDone(Bad); break; case InExponent: if (isDecimalDigit(current)) { record8(current); } else setDone(Number); break; case InIdentifierUnicodeEscapeStart: if (current == 'u') state = InIdentifierUnicodeEscape; else setDone(Bad); break; case InIdentifierUnicodeEscape: if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) { record16(convertUnicode(current, next1, next2, next3)); shift(3); state = InIdentifier; } else { setDone(Bad); } break; default: assert(!"Unhandled state in switch statement"); } // move on to the next character if (!done) shift(1); #ifndef KJS_PURE_ECMA if (state != Start && state != InSingleLineComment) bol = false; #endif } // no identifiers allowed directly after numeric literal, e.g. "3in" is bad if ((state == Number || state == Octal || state == Hex) && isIdentStart(current)) state = Bad; // terminate string buffer8[pos8] = '\0'; #ifdef KJS_DEBUG_LEX fprintf(stderr, "line: %d ", lineNo()); fprintf(stderr, "yytext (%x): ", buffer8[0]); fprintf(stderr, "%s ", buffer8); #endif double dval = 0; if (state == Number) { dval = strtod(buffer8, 0L); } else if (state == Hex) { // scan hex numbers const char *p = buffer8 + 2; while (char c = *p++) { dval *= 16; dval += convertHex(c); } if (dval >= mantissaOverflowLowerBound) dval = parseIntOverflow(buffer8 + 2, p - (buffer8 + 3), 16); state = Number; } else if (state == Octal) { // scan octal number const char *p = buffer8 + 1; while (char c = *p++) { dval *= 8; dval += c - '0'; } if (dval >= mantissaOverflowLowerBound) dval = parseIntOverflow(buffer8 + 1, p - (buffer8 + 2), 8); state = Number; } #ifdef KJS_DEBUG_LEX switch (state) { case Eof: printf("(EOF)\n"); break; case Other: printf("(Other)\n"); break; case Identifier: printf("(Identifier)/(Keyword)\n"); break; case String: printf("(String)\n"); break; case Number: printf("(Number)\n"); break; default: printf("(unknown)"); } #endif if (state != Identifier && eatNextIdentifier) eatNextIdentifier = false; restrKeyword = false; delimited = false; kjsyylloc.first_line = yylineno; // ??? kjsyylloc.last_line = yylineno; switch (state) { case Eof: token = 0; break; case Other: if(token == '}' || token == ';') { delimited = true; } break; case IdentifierOrKeyword: if ((token = Lookup::find(&mainTable, buffer16, pos16)) < 0) { case Identifier: // Lookup for keyword failed, means this is an identifier // Apply anonymous-function hack below (eat the identifier) if (eatNextIdentifier) { eatNextIdentifier = false; token = lex(); break; } kjsyylval.ident = makeIdentifier(buffer16, pos16); token = IDENT; break; } eatNextIdentifier = false; // Hack for "f = function somename() { ... }", too hard to get into the grammar if (token == FUNCTION && lastToken == '=' ) eatNextIdentifier = true; if (token == CONTINUE || token == BREAK || token == RETURN || token == THROW) restrKeyword = true; break; case String: kjsyylval.ustr = makeUString(buffer16, pos16); token = STRING; break; case Number: kjsyylval.dval = dval; token = NUMBER; break; case Bad: #ifdef KJS_DEBUG_LEX fprintf(stderr, "yylex: ERROR.\n"); #endif error = true; return -1; default: assert(!"unhandled numeration value in switch"); error = true; return -1; } lastToken = token; return token; } bool Lexer::isWhiteSpace() const { return current == '\t' || current == 0x0b || current == 0x0c || isSeparatorSpace(current); } bool Lexer::isLineTerminator() { bool cr = (current == '\r'); bool lf = (current == '\n'); if (cr) skipLF = true; else if (lf) skipCR = true; return cr || lf || current == 0x2028 || current == 0x2029; } bool Lexer::isIdentStart(int c) { return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other)) || c == '$' || c == '_'; } bool Lexer::isIdentPart(int c) { return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector)) || c == '$' || c == '_'; } static bool isDecimalDigit(int c) { return (c >= '0' && c <= '9'); } bool Lexer::isHexDigit(int c) { return (c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F'); } bool Lexer::isOctalDigit(int c) { return (c >= '0' && c <= '7'); } int Lexer::matchPunctuator(int c1, int c2, int c3, int c4) { if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') { shift(4); return URSHIFTEQUAL; } else if (c1 == '=' && c2 == '=' && c3 == '=') { shift(3); return STREQ; } else if (c1 == '!' && c2 == '=' && c3 == '=') { shift(3); return STRNEQ; } else if (c1 == '>' && c2 == '>' && c3 == '>') { shift(3); return URSHIFT; } else if (c1 == '<' && c2 == '<' && c3 == '=') { shift(3); return LSHIFTEQUAL; } else if (c1 == '>' && c2 == '>' && c3 == '=') { shift(3); return RSHIFTEQUAL; } else if (c1 == '<' && c2 == '=') { shift(2); return LE; } else if (c1 == '>' && c2 == '=') { shift(2); return GE; } else if (c1 == '!' && c2 == '=') { shift(2); return NE; } else if (c1 == '+' && c2 == '+') { shift(2); if (terminator) return AUTOPLUSPLUS; else return PLUSPLUS; } else if (c1 == '-' && c2 == '-') { shift(2); if (terminator) return AUTOMINUSMINUS; else return MINUSMINUS; } else if (c1 == '=' && c2 == '=') { shift(2); return EQEQ; } else if (c1 == '+' && c2 == '=') { shift(2); return PLUSEQUAL; } else if (c1 == '-' && c2 == '=') { shift(2); return MINUSEQUAL; } else if (c1 == '*' && c2 == '=') { shift(2); return MULTEQUAL; } else if (c1 == '/' && c2 == '=') { shift(2); return DIVEQUAL; } else if (c1 == '&' && c2 == '=') { shift(2); return ANDEQUAL; } else if (c1 == '^' && c2 == '=') { shift(2); return XOREQUAL; } else if (c1 == '%' && c2 == '=') { shift(2); return MODEQUAL; } else if (c1 == '|' && c2 == '=') { shift(2); return OREQUAL; } else if (c1 == '<' && c2 == '<') { shift(2); return LSHIFT; } else if (c1 == '>' && c2 == '>') { shift(2); return RSHIFT; } else if (c1 == '&' && c2 == '&') { shift(2); return AND; } else if (c1 == '|' && c2 == '|') { shift(2); return OR; } switch(c1) { case '=': case '>': case '<': case ',': case '!': case '~': case '?': case ':': case '.': case '+': case '-': case '*': case '/': case '&': case '|': case '^': case '%': case '(': case ')': case '{': case '}': case '[': case ']': case ';': shift(1); return static_cast(c1); default: return -1; } } unsigned short Lexer::singleEscape(unsigned short c) { switch(c) { case 'b': return 0x08; case 't': return 0x09; case 'n': return 0x0A; case 'v': return 0x0B; case 'f': return 0x0C; case 'r': return 0x0D; case '"': return 0x22; case '\'': return 0x27; case '\\': return 0x5C; default: return c; } } unsigned short Lexer::convertOctal(int c1, int c2, int c3) { return static_cast((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0'); } unsigned char Lexer::convertHex(int c) { if (c >= '0' && c <= '9') return static_cast(c - '0'); if (c >= 'a' && c <= 'f') return static_cast(c - 'a' + 10); return static_cast(c - 'A' + 10); } unsigned char Lexer::convertHex(int c1, int c2) { return ((convertHex(c1) << 4) + convertHex(c2)); } KJS::UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4) { return KJS::UChar((convertHex(c1) << 4) + convertHex(c2), (convertHex(c3) << 4) + convertHex(c4)); } void Lexer::record8(int c) { ASSERT(c >= 0); ASSERT(c <= 0xff); // enlarge buffer if full if (pos8 >= size8 - 1) { char *tmp = new char[2 * size8]; memcpy(tmp, buffer8, size8 * sizeof(char)); delete [] buffer8; buffer8 = tmp; size8 *= 2; } buffer8[pos8++] = (char) c; } void Lexer::record16(int c) { ASSERT(c >= 0); ASSERT(c <= USHRT_MAX); record16(UChar(static_cast(c))); } void Lexer::record16(KJS::UChar c) { // enlarge buffer if full if (pos16 >= size16 - 1) { KJS::UChar *tmp = new KJS::UChar[2 * size16]; memcpy(tmp, buffer16, size16 * sizeof(KJS::UChar)); delete [] buffer16; buffer16 = tmp; size16 *= 2; } buffer16[pos16++] = c; } bool Lexer::scanRegExp() { pos16 = 0; bool lastWasEscape = false; bool inBrackets = false; while (1) { if (isLineTerminator() || current == -1) return false; else if (current != '/' || lastWasEscape == true || inBrackets == true) { // keep track of '[' and ']' if (!lastWasEscape) { if ( current == '[' && !inBrackets ) inBrackets = true; if ( current == ']' && inBrackets ) inBrackets = false; } record16(current); lastWasEscape = !lastWasEscape && (current == '\\'); } else { // end of regexp pattern = UString(buffer16, pos16); pos16 = 0; shift(1); break; } shift(1); } while (isIdentPart(current)) { record16(current); shift(1); } flags = UString(buffer16, pos16); return true; } void Lexer::doneParsing() { for (unsigned i = 0; i < numIdentifiers; i++) { delete identifiers[i]; } fastFree(identifiers); identifiers = 0; numIdentifiers = 0; identifiersCapacity = 0; for (unsigned i = 0; i < numStrings; i++) { delete strings[i]; } fastFree(strings); strings = 0; numStrings = 0; stringsCapacity = 0; } const int initialCapacity = 64; const int growthFactor = 2; // FIXME: this completely ignores its parameters, instead using buffer16 and pos16 - wtf? Identifier *Lexer::makeIdentifier(KJS::UChar*, unsigned int) { if (numIdentifiers == identifiersCapacity) { identifiersCapacity = (identifiersCapacity == 0) ? initialCapacity : identifiersCapacity *growthFactor; identifiers = (KJS::Identifier **)fastRealloc(identifiers, sizeof(KJS::Identifier *) * identifiersCapacity); } KJS::Identifier *identifier = new KJS::Identifier(buffer16, pos16); identifiers[numIdentifiers++] = identifier; return identifier; } // FIXME: this completely ignores its parameters, instead using buffer16 and pos16 - wtf? UString *Lexer::makeUString(KJS::UChar*, unsigned int) { if (numStrings == stringsCapacity) { stringsCapacity = (stringsCapacity == 0) ? initialCapacity : stringsCapacity *growthFactor; strings = (UString **)fastRealloc(strings, sizeof(UString *) * stringsCapacity); } UString *string = new UString(buffer16, pos16); strings[numStrings++] = string; return string; } }