/* Copyright (C) 1997 Martin Jones (mjones@kde.org) (C) 1997 Torben Weis (weis@kde.org) (C) 1998 Waldo Bastian (bastian@kde.org) (C) 1999 Lars Knoll (knoll@kde.org) (C) 1999 Antti Koivisto (koivisto@kde.org) (C) 2001 Dirk Mueller (mueller@kde.org) Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved. Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "config.h" #include "HTMLTokenizer.h" #include "CSSHelper.h" #include "Cache.h" #include "CachedScript.h" #include "DocLoader.h" #include "DocumentFragment.h" #include "EventNames.h" #include "Frame.h" #include "FrameLoader.h" #include "FrameView.h" #include "HTMLElement.h" #include "HTMLNames.h" #include "HTMLParser.h" #include "HTMLScriptElement.h" #include "HTMLViewSourceDocument.h" #include "Settings.h" #include "SystemTime.h" #include "kjs_proxy.h" #include #include "HTMLEntityNames.c" // #define INSTRUMENT_LAYOUT_SCHEDULING 1 #if MOBILE // The mobile device needs to be responsive, as such the tokenizer chunk size is reduced. // This value is used to define how many characters the tokenizer will process before // yeilding control. 
#define TOKENIZER_CHUNK_SIZE 256 #else #define TOKENIZER_CHUNK_SIZE 4096 #endif using namespace std; using namespace WTF; namespace WebCore { using namespace HTMLNames; using namespace EventNames; #if MOBILE // As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise // it will take way to long to load a page. const double tokenizerTimeDelay = 0.300; #else // FIXME: We would like this constant to be 200ms. // Yielding more aggressively results in increased responsiveness and better incremental rendering. // It slows down overall page-load on slower machines, though, so for now we set a value of 500. const double tokenizerTimeDelay = 0.500; #endif static const char commentStart [] = " as a close comment, even though it's // not technically valid. endCharsCount = 4; } if (handleBrokenComments || endCharsCount > 1) { src.advance(m_lineNumber); if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle())) { checkScriptBuffer(); scriptCode[scriptCodeSize] = 0; scriptCode[scriptCodeSize + 1] = 0; currToken.tagName = commentAtom; currToken.beginTag = true; state = processListing(SegmentedString(scriptCode, scriptCodeSize - endCharsCount), state); processToken(); currToken.tagName = commentAtom; currToken.beginTag = false; processToken(); scriptCodeSize = 0; } state.setInComment(false); return state; // Finished parsing comment } } src.advance(m_lineNumber); } return state; } HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state) { checkScriptBuffer(src.length()); while (!src.isEmpty()) { scriptCode[scriptCodeSize++] = *src; if (*src == '>' && scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') { src.advance(m_lineNumber); state.setInServer(false); scriptCodeSize = 0; return state; // Finished parsing server include } src.advance(m_lineNumber); } return state; } HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString &src, State state) 
{
    UChar oldchar = 0;
    while (!src.isEmpty()) {
        UChar chbegin = *src;
        // Track quoting so that a '>' inside a quoted attribute does not end the PI.
        if (chbegin == '\'')
            tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
        else if (chbegin == '\"')
            tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
        // Look for '?>'
        // Some crappy sites omit the "?" before it, so
        // we look for an unquoted '>' instead. (IE compatible)
        else if (chbegin == '>' && (!tquote || oldchar == '?')) {
            // We got a '?>' sequence
            state.setInProcessingInstruction(false);
            src.advance(m_lineNumber);
            state.setDiscardLF(true);
            return state; // Finished parsing comment!
        }
        src.advance(m_lineNumber);
        oldchar = chbegin;
    }
    return state;
}

// Copies raw character data into the token buffer, normalizing CR and CRLF to LF.
HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString &src, State state)
{
    while (!src.isEmpty()) {
        UChar cc = *src;

        if (state.skipLF()) {
            state.setSkipLF(false);
            // The CR of a CRLF pair already emitted '\n'; drop the following LF.
            if (cc == '\n') {
                src.advance(m_lineNumber);
                continue;
            }
        }

        // do we need to enlarge the buffer?
        checkBuffer();

        if (cc == '\r') {
            // Emit '\n' now; arm skipLF in case an LF follows (CRLF).
            state.setSkipLF(true);
            *dest++ = '\n';
        } else
            *dest++ = cc;
        src.advance(m_lineNumber);
    }

    return state;
}

// Decodes a character entity reference (numeric decimal/hex or named) after the '&'.
// Decoded characters are pushed back onto the stream; in view-source mode the literal
// text is written to *dest instead.  cBufferPos carries partial state across calls
// when the input chunk ends mid-entity.
HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString &src, UChar*& dest, State state, unsigned &cBufferPos, bool start, bool parsingTag)
{
    if (start) {
        cBufferPos = 0;
        state.setEntityState(SearchEntity);
        EntityUnicodeValue = 0;
    }

    while(!src.isEmpty()) {
        UChar cc = *src;
        switch(state.entityState()) {
        case NoEntity:
            ASSERT(state.entityState() != NoEntity);
            return state;

        case SearchEntity:
            // '#' introduces a numeric reference; anything else starts a name.
            if(cc == '#') {
                cBuffer[cBufferPos++] = cc;
                src.advance(m_lineNumber);
                state.setEntityState(NumericSearch);
            } else
                state.setEntityState(EntityName);
            break;

        case NumericSearch:
            // 'x'/'X' selects hexadecimal; a digit selects decimal; otherwise bail.
            if (cc == 'x' || cc == 'X') {
                cBuffer[cBufferPos++] = cc;
                src.advance(m_lineNumber);
                state.setEntityState(Hexadecimal);
            } else if (cc >= '0' && cc <= '9')
                state.setEntityState(Decimal);
            else
                state.setEntityState(SearchSemicolon);
            break;

        case Hexadecimal: {
            // Buffer at most 10 characters ("x" plus digits) for a hex reference.
            int ll = min(src.length(), 10 - cBufferPos);
            while (ll--) {
                cc = *src;
                if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 
'f') || (cc >= 'A' && cc <= 'F'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                int digit;
                if (cc < 'A')
                    digit = cc - '0';
                else
                    digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
                EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
                cBuffer[cBufferPos++] = cc;
                src.advance(m_lineNumber);
            }
            if (cBufferPos == 10)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case Decimal: {
            // Accept at most 9 decimal digits.
            int ll = min(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = *src;
                if (!(cc >= '0' && cc <= '9')) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                cBuffer[cBufferPos++] = cc;
                src.advance(m_lineNumber);
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case EntityName: {
            // Entity names are ASCII alphanumerics; at most 9 characters buffered.
            int ll = min(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = *src;
                if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                cBuffer[cBufferPos++] = cc;
                src.advance(m_lineNumber);
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            if (state.entityState() == SearchSemicolon) {
                if(cBufferPos > 1) {
                    // Look the accumulated name up in the generated entity table.
                    const Entity *e = findEntity(cBuffer, cBufferPos);
                    if(e)
                        EntityUnicodeValue = e->code;

                    // be IE compatible
                    if(parsingTag && EntityUnicodeValue > 255 && *src != ';')
                        EntityUnicodeValue = 0;
                }
            }
            else
                break;
            // Intentional fall-through into SearchSemicolon once the name is resolved.
        }
        case SearchSemicolon:
            // Don't allow values that are more than 21 bits.
            if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
                if (!inViewSourceMode()) {
                    // Consume the terminating ';' if present, then push the decoded
                    // character(s) back onto the input stream.
                    if (*src == ';')
                        src.advance(m_lineNumber);
                    if (EntityUnicodeValue <= 0xFFFF) {
                        checkBuffer();
                        src.push(fixUpChar(EntityUnicodeValue));
                    } else {
                        // Convert to UTF-16, using surrogate code points.
                        checkBuffer(2);
                        src.push(U16_LEAD(EntityUnicodeValue));
                        src.push(U16_TRAIL(EntityUnicodeValue));
                    }
                } else {
                    // FIXME: We should eventually colorize entities by sending them as a special token.
checkBuffer(11);
                    // View-source mode: emit the entity text literally.
                    *dest++ = '&';
                    for (unsigned i = 0; i < cBufferPos; i++)
                        dest[i] = cBuffer[i];
                    dest += cBufferPos;
                    if (*src == ';') {
                        *dest++ = ';';
                        src.advance(m_lineNumber);
                    }
                }
            } else {
                checkBuffer(10);
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for (unsigned i = 0; i < cBufferPos; i++)
                    dest[i] = cBuffer[i];
                dest += cBufferPos;
            }

            state.setEntityState(NoEntity);
            return state;
        }
    }

    return state;
}

// Parses a tag via the tag sub-state machine.
// NOTE(review): this copy of parseTag() is corrupted -- several interior spans were
// lost where the text contained markup-like sequences.  Surviving fragments are
// preserved exactly as found and flagged; recover the full body from upstream history.
HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString &src, State state)
{
    ASSERT(!state.hasEntityState());

    unsigned cBufferPos = m_cBufferPos;
    bool lastIsSlash = false;

    while (!src.isEmpty()) {
        checkBuffer();
        switch(state.tagState()) {
        case NoTag: {
            m_cBufferPos = cBufferPos;
            return state;
        }
        case TagName: {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            qDebug("TagName");
#endif
            // searchCount tracks progress matching the 4-char comment-start marker.
            if (searchCount > 0) {
                if (*src == commentStart[searchCount]) {
                    searchCount++;
                    if (searchCount == 4) {
#ifdef TOKEN_DEBUG
                        kdDebug( 6036 ) << "Found comment" << endl;
#endif
                        // Found ' as a valid comment, since both mozilla and IE on windows
                        // can handle this case. Only do this in quirks mode. -dwh
                        // NOTE(review): the comment above is two fused fragments; the
                        // code between them (the normal comment-start handling) is
                        // part of the lost span.
                        if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
                            state.setInComment(false);
                            src.advance(m_lineNumber);
                            if (!src.isEmpty())
                                // cuts off high bits, which is okay
                                cBuffer[cBufferPos++] = *src;
                        } else
                            state = parseComment(src, state);
                        m_cBufferPos = cBufferPos;
                        return state; // Finished parsing tag!
                    }
                    // cuts off high bits, which is okay
                    cBuffer[cBufferPos++] = *src;
                    src.advance(m_lineNumber);
                    break;
                } else
                    searchCount = 0; // Stop looking for '
                // NOTE(review): a large span is missing here -- the rest of parseTag()
                // (attribute scanning, tag emission), and the beginning of
                // processToken() up through its TOKEN_DEBUG dump.  The lines below are
                // preserved as found: a stray "searchCount = 1" with its truncated
                // comment, then a fragment of processToken()'s debug output that
                // belongs to the next function.
                searchCount = 1; // Look for '
                " << name << endl;
                if (currToken.flat)
                    kdDebug( 6036 ) << "Token is FLAT!" << endl;
                if(!text.isNull())
                    kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
                unsigned l = currToken.attrs ? 
currToken.attrs->length() : 0; if(l) { kdDebug( 6036 ) << "Attributes: " << l << endl; for (unsigned i = 0; i < l; ++i) { Attribute* c = currToken.attrs->attributeItem(i); kdDebug( 6036 ) << " " << c->localName().deprecatedString() << "=\"" << c->value().deprecatedString() << "\"" << endl; } } kdDebug( 6036 ) << endl; #endif RefPtr n; if (!m_parserStopped) { if (inViewSourceMode()) static_cast(m_doc)->addViewSourceToken(&currToken); else // pass the token over to the parser, the parser DOES NOT delete the token n = parser->parseToken(&currToken); } currToken.reset(); if (jsProxy) jsProxy->setEventHandlerLineno(0); return n.release(); } HTMLTokenizer::~HTMLTokenizer() { ASSERT(!inWrite); reset(); delete parser; } void HTMLTokenizer::enlargeBuffer(int len) { int newSize = max(size * 2, size + len); int oldOffset = dest - buffer; buffer = static_cast(fastRealloc(buffer, newSize * sizeof(UChar))); dest = buffer + oldOffset; size = newSize; } void HTMLTokenizer::enlargeScriptBuffer(int len) { int newSize = max(scriptCodeMaxSize * 2, scriptCodeMaxSize + len); scriptCode = static_cast(fastRealloc(scriptCode, newSize * sizeof(UChar))); scriptCodeMaxSize = newSize; } void HTMLTokenizer::executeScriptsWaitingForStylesheets() { ASSERT(m_doc->haveStylesheetsLoaded()); if (m_hasScriptsWaitingForStylesheets) notifyFinished(0); } void HTMLTokenizer::notifyFinished(CachedResource*) { #ifdef INSTRUMENT_LAYOUT_SCHEDULING if (!m_doc->ownerElement()) printf("script loaded at %d\n", m_doc->elapsedTime()); #endif ASSERT(!pendingScripts.isEmpty()); // Make scripts loaded from file URLs wait for stylesheets to match Tiger behavior where // file loads were serialized in lower level. 
// FIXME: this should really be done for all script loads or the same effect should be achieved by other // means, like javascript suspend/resume m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded() && pendingScripts.head()->url().startsWith("file:", false); if (m_hasScriptsWaitingForStylesheets) return; bool finished = false; while (!finished && pendingScripts.head()->isLoaded()) { #ifdef TOKEN_DEBUG kdDebug( 6036 ) << "Finished loading an external script" << endl; #endif CachedScript* cs = pendingScripts.dequeue(); ASSERT(cache()->disabled() || cs->accessCount() > 0); String scriptSource = cs->script(); #ifdef TOKEN_DEBUG kdDebug( 6036 ) << "External script is:" << endl << scriptSource.deprecatedString() << endl; #endif setSrc(SegmentedString()); // make sure we forget about the script before we execute the new one // infinite recursion might happen otherwise DeprecatedString cachedScriptUrl( cs->url().deprecatedString() ); bool errorOccurred = cs->errorOccurred(); cs->deref(this); RefPtr n = scriptNode.release(); #ifdef INSTRUMENT_LAYOUT_SCHEDULING if (!m_doc->ownerElement()) printf("external script beginning execution at %d\n", m_doc->elapsedTime()); #endif if (errorOccurred) EventTargetNodeCast(n.get())->dispatchHTMLEvent(errorEvent, true, false); else { if (static_cast(n.get())->shouldExecuteAsJavaScript()) m_state = scriptExecution(scriptSource.deprecatedString(), m_state, cachedScriptUrl); EventTargetNodeCast(n.get())->dispatchHTMLEvent(loadEvent, false, false); } // The state of pendingScripts.isEmpty() can change inside the scriptExecution() // call above, so test afterwards. finished = pendingScripts.isEmpty(); if (finished) { m_state.setLoadingExtScript(false); #ifdef INSTRUMENT_LAYOUT_SCHEDULING if (!m_doc->ownerElement()) printf("external script finished execution at %d\n", m_doc->elapsedTime()); #endif } // 'm_requestingScript' is true when we are called synchronously from // scriptHandler(). 
In that case scriptHandler() will take care // of pendingSrc. if (!m_requestingScript) { SegmentedString rest = pendingSrc; pendingSrc.clear(); write(rest, false); // we might be deleted at this point, do not // access any members. } } } bool HTMLTokenizer::isWaitingForScripts() const { return m_state.loadingExtScript(); } void HTMLTokenizer::setSrc(const SegmentedString &source) { src = source; } void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment) { HTMLTokenizer tok(fragment); tok.setForceSynchronous(true); tok.write(source, true); tok.finish(); ASSERT(!tok.processingData()); // make sure we're done (see 3963151) } UChar decodeNamedEntity(const char* name) { const Entity* e = findEntity(name, strlen(name)); return e ? e->code : 0; } }