var DEBUG = false; // `true` to print debugging info.
var TIMER = false; // `true` to time calls to `lex()` and print the results.

var debug = require('./debug')('lex');

exports = module.exports = lex;

/**
 * Convert a CSS string into an array of lexical tokens.
 *
 * The lexer is a character-driven state machine. Every emitted token carries
 * a `type` plus `start` and `end` positions (`{ line, col }`). Depending on
 * the type, a token also carries `text` (selectors, comments), `name` and
 * `value` (declarations), `name` and optionally `prefix` (@-groups), or
 * `value` (@-rules such as `@import`). Block boundaries are reported with
 * `end` and `at-group-end` tokens.
 *
 * @param {String} css CSS
 * @returns {Array} lexical tokens
 */
function lex(css) {
  var start; // Debug timer start.

  var buffer = ''; // Character accumulator
  var ch; // Current character
  var column = 0; // Current source column number
  var cursor = -1; // Current source cursor position
  var depth = 0; // Current nesting depth
  var line = 1; // Current source line number
  var state = 'before-selector'; // Current state
  var stack = [state]; // State stack
  var token = {}; // Current token
  var tokens = []; // Token accumulator

  // Supported @-rules, in roughly descending order of usage probability.
  // A plain string is both the rule name and the token type. An object may
  // override the token `type`, record a vendor `prefix`, or select the state
  // entered after the name (`state`, defaulting to 'at-group').
  var atRules = [
    'media',
    'keyframes',
    { name: '-webkit-keyframes', type: 'keyframes', prefix: '-webkit-' },
    { name: '-moz-keyframes', type: 'keyframes', prefix: '-moz-' },
    { name: '-ms-keyframes', type: 'keyframes', prefix: '-ms-' },
    { name: '-o-keyframes', type: 'keyframes', prefix: '-o-' },
    'font-face',
    { name: 'import', state: 'before-at-value' },
    { name: 'charset', state: 'before-at-value' },
    'supports',
    'viewport',
    { name: 'namespace', state: 'before-at-value' },
    'document',
    { name: '-moz-document', type: 'document', prefix: '-moz-' },
    'page'
  ];

  // -- Functions ------------------------------------------------------------

  /**
   * Advance the character cursor and return the next character.
   *
   * @returns {String} The next character, or `undefined` past the end of
   *   the input.
   */
  function getCh() {
    skip();
    return css[cursor];
  }

  /**
   * Return the state at the given index in the stack.
   * The stack is LIFO so indexing is from the right.
   *
   * @param {Number} [index=0] Index to return.
   * @returns {String} state
   */
  function getState(index) {
    return index ? stack[stack.length - 1 - index] : state;
  }

  /**
   * Look ahead for a string beginning from the next position. The string
   * being looked for must start at the next position.
   *
   * @param {String} str The string to look for.
   * @returns {Boolean} Whether the string was found.
   */
  function isNextString(str) {
    var from = cursor + 1;

    return (str === css.slice(from, from + str.length));
  }

  /**
   * Find the start position of a substring, searching forward from the
   * cursor. The string being looked for may begin anywhere at or after
   * `cursor + from`.
   *
   * @param {String} str The substring to look for.
   * @param {Number} [from=1] Offset from the cursor at which the search
   *   starts; the default starts at the next position.
   * @returns {Number|false} The position relative to the cursor, or `false`
   *   if not found.
   */
  function find(str, from) {
    // Searching with a start index avoids copying the rest of the input.
    var index = css.indexOf(str, cursor + (from || 1));

    return index === -1 ? false : index - cursor;
  }

  /**
   * Determine whether a character is next.
   *
   * @param {String} ch Character.
   * @returns {Boolean} Whether the character is next.
   */
  function isNextChar(ch) {
    return ch === peek(1);
  }

  /**
   * Determine whether the current character is escaped, i.e. preceded by an
   * odd number of consecutive backslashes. An even run means the backslashes
   * escape each other and the current character stands on its own.
   *
   * @returns {Boolean} Whether the current character is escaped.
   */
  function isEscaped() {
    var backslashes = 0;
    var index = cursor - 1;

    while (index >= 0 && css[index] === '\\') {
      backslashes++;
      index--;
    }

    return backslashes % 2 === 1;
  }

  /**
   * Return the character at the given cursor offset. The offset is relative
   * to the cursor, so negative values move backwards. An offset of `0` is
   * treated like the default.
   *
   * @param {Number} [offset=1] Cursor offset.
   * @returns {String} Character.
   */
  function peek(offset) {
    return css[cursor + (offset || 1)];
  }

  /**
   * Remove the current state from the stack and set the new current state.
   *
   * @returns {String} The removed state.
   */
  function popState() {
    var removed = stack.pop();

    state = stack[stack.length - 1];

    return removed;
  }

  /**
   * Set the current state and add it to the stack.
   *
   * @param {String} newState The new state.
   * @returns {Number} The new stack length.
   */
  function pushState(newState) {
    state = newState;
    stack.push(state);

    return stack.length;
  }

  /**
   * Replace the current state with a new state.
   *
   * @param {String} newState The new state.
   * @returns {String} The replaced state.
   */
  function replaceState(newState) {
    var previousState = state;

    stack[stack.length - 1] = state = newState;

    return previousState;
  }

  /**
   * Move the character cursor, keeping `line` and `column` in sync.
   * Positive numbers move the cursor forward.
   * Negative numbers are not supported!
   *
   * @param {Number} [n=1] Number of characters to skip.
   */
  function skip(n) {
    if ((n || 1) === 1) {
      // Leaving a newline character starts a new source line.
      if (css[cursor] === '\n') {
        line++;
        column = 1;
      } else {
        column++;
      }

      cursor++;
    } else {
      // The skipped span starts at the current character and stops just
      // before the destination, so every newline in it has been passed.
      var skipped = css.slice(cursor, cursor + n).split('\n');

      if (skipped.length > 1) {
        line += skipped.length - 1;
        column = 1;
      }

      column += skipped[skipped.length - 1].length;
      cursor = cursor + n;
    }
  }

  /**
   * Add the current token to the pile and reset the buffer.
   */
  function addToken() {
    token.end = {
      line: line,
      col: column
    };

    if (DEBUG) {
      debug('addToken:', JSON.stringify(token, null, 2));
    }

    tokens.push(token);

    buffer = '';
    token = {};
  }

  /**
   * Set the current token.
   *
   * @param {String} type Token type.
   */
  function initializeToken(type) {
    token = {
      type: type,
      start: {
        line: line,
        col: column
      }
    };
  }

  // -- Main Loop ------------------------------------------------------------

  /*
  The main loop is a state machine that reads in one character at a time,
  and determines what to do based on the current state and character.
  This is implemented as a series of nested `switch` statements and the
  case orders have been mildly optimized based on rough probabilities
  calculated by processing a small sample of real-world CSS.

  Further optimization (such as a dispatch table) shouldn't be necessary
  since the total number of cases is very low.
  */

  if (TIMER) {
    start = Date.now();
  }

  while ((ch = getCh())) {
    if (DEBUG) {
      debug(ch, getState());
    }

    switch (ch) {
      // Space
      case ' ':
        switch (getState()) {
          case 'selector':
          case 'value':
          case 'value-paren':
          case 'at-group':
          case 'at-value':
          case 'comment':
          case 'double-string':
          case 'single-string':
            buffer += ch;
            break;
        }
        break;

      // Newline or tab
      case '\n':
      case '\t':
      case '\r':
      case '\f':
        switch (getState()) {
          case 'value':
          case 'value-paren':
          case 'at-group':
          case 'comment':
          case 'single-string':
          case 'double-string':
          case 'selector':
            buffer += ch;
            break;

          case 'at-value':
            // Tokenize an @-rule if a semi-colon was omitted.
            if ('\n' === ch) {
              token.value = buffer.trim();
              addToken();
              popState();
            }
            break;
        }
        break;

      case ':':
        switch (getState()) {
          case 'name':
            token.name = buffer.trim();
            buffer = '';

            replaceState('before-value');
            break;

          case 'before-selector':
            buffer += ch;

            initializeToken('selector');
            pushState('selector');
            break;

          case 'before-value':
            replaceState('value');
            buffer += ch;
            break;

          default:
            buffer += ch;
            break;
        }
        break;

      case ';':
        switch (getState()) {
          case 'name':
          case 'before-value':
          case 'value':
            // Tokenize a declaration
            // if value is empty skip the declaration
            if (buffer.trim().length > 0) {
              token.value = buffer.trim();
              addToken();
            }
            replaceState('before-name');
            break;

          case 'value-paren':
            // Insignificant semi-colon
            buffer += ch;
            break;

          case 'at-value':
            // Tokenize an @-rule
            token.value = buffer.trim();
            addToken();
            popState();
            break;

          case 'before-name':
            // Extraneous semi-colon
            break;

          default:
            buffer += ch;
            break;
        }
        break;

      case '{':
        switch (getState()) {
          case 'selector':
            // If the brace is escaped (`\{`) it belongs to the selector.
            if (isEscaped()) {
              buffer += ch;
              break;
            }

            // Tokenize a selector
            token.text = buffer.trim();
            addToken();
            replaceState('before-name');
            depth = depth + 1;
            break;

          case 'at-group':
            // Tokenize an @-group
            token.name = buffer.trim();

            // XXX: @-rules are starting to get hairy
            // These groups contain declarations directly; the others
            // contain nested rules.
            switch (token.type) {
              case 'font-face':
              case 'viewport':
              case 'page':
                pushState('before-name');
                break;

              default:
                pushState('before-selector');
            }

            addToken();
            depth = depth + 1;
            break;

          case 'name':
          case 'at-rule':
            // Tokenize a declaration or an @-rule
            token.name = buffer.trim();
            addToken();
            pushState('before-name');
            depth = depth + 1;
            break;

          case 'comment':
          case 'double-string':
          case 'single-string':
            // Ignore braces in comments and strings
            buffer += ch;
            break;

          case 'before-value':
            replaceState('value');
            buffer += ch;
            break;
        }
        break;

      case '}':
        switch (getState()) {
          case 'before-name':
          case 'name':
          case 'before-value':
          case 'value':
            // If the buffer contains anything, it is a value
            if (buffer) {
              token.value = buffer.trim();
            }

            // If the current token has a name and a value it should be
            // tokenized.
            if (token.name && token.value) {
              addToken();
            }

            // Leave the block
            initializeToken('end');
            addToken();
            popState();

            // We might need to leave again.
            // XXX: What about 3 levels deep?
            if ('at-group' === getState()) {
              initializeToken('at-group-end');
              addToken();
              popState();
            }

            if (depth > 0) {
              depth = depth - 1;
            }
            break;

          case 'at-group':
          case 'before-selector':
          case 'selector':
            // If the brace is escaped (`\}`) it belongs to the buffer.
            if (isEscaped()) {
              buffer += ch;
              break;
            }

            if (depth > 0) {
              // Leave block if in an at-group
              if ('at-group' === getState(1)) {
                initializeToken('at-group-end');
                addToken();
              }
            }

            if (depth > 1) {
              popState();
            }

            if (depth > 0) {
              depth = depth - 1;
            }
            break;

          case 'double-string':
          case 'single-string':
          case 'comment':
            // Ignore braces in comments and strings.
            buffer += ch;
            break;
        }
        break;

      // Strings
      case '"':
      case "'":
        switch (getState()) {
          case 'double-string':
            // Only an unescaped matching quote closes the string.
            if ('"' === ch && !isEscaped()) {
              popState();
            }
            break;

          case 'single-string':
            if ("'" === ch && !isEscaped()) {
              popState();
            }
            break;

          case 'before-at-value':
            replaceState('at-value');
            pushState('"' === ch ? 'double-string' : 'single-string');
            break;

          case 'before-value':
            replaceState('value');
            pushState('"' === ch ? 'double-string' : 'single-string');
            break;

          case 'comment':
            // Ignore strings within comments.
            break;

          default:
            if (!isEscaped()) {
              pushState('"' === ch ? 'double-string' : 'single-string');
            }
        }

        buffer += ch;
        break;

      // Comments
      case '/':
        switch (getState()) {
          case 'comment':
          case 'double-string':
          case 'single-string':
            // Ignore
            buffer += ch;
            break;

          case 'before-value':
          case 'selector':
          case 'name':
          case 'value':
            if (isNextChar('*')) {
              // Ignore comments in selectors, properties and values. They
              // are difficult to represent in the AST.
              // Start searching after the `/*` opener so that `/*/` is not
              // mistaken for a complete comment.
              var pos = find('*/', 2);

              if (pos !== false) {
                // Land on the closing `/`; the next read moves past it.
                skip(pos + 1);
              }
            } else {
              if (getState() === 'before-value') {
                replaceState('value');
              }
              buffer += ch;
            }
            break;

          default:
            if (isNextChar('*')) {
              // Create a comment token
              initializeToken('comment');
              pushState('comment');
              skip();
            } else {
              buffer += ch;
            }
            break;
        }
        break;

      // Comment end or universal selector
      case '*':
        switch (getState()) {
          case 'comment':
            if (isNextChar('/')) {
              // Tokenize a comment
              token.text = buffer; // Don't trim()!
              skip();
              addToken();
              popState();
            } else {
              buffer += ch;
            }
            break;

          case 'before-selector':
            buffer += ch;
            initializeToken('selector');
            pushState('selector');
            break;

          case 'before-value':
            replaceState('value');
            buffer += ch;
            break;

          default:
            buffer += ch;
        }
        break;

      // @-rules
      case '@':
        switch (getState()) {
          case 'comment':
          case 'double-string':
          case 'single-string':
            buffer += ch;
            break;

          case 'before-value':
            replaceState('value');
            buffer += ch;
            break;

          default:
            // Iterate over the supported @-rules and attempt to tokenize
            // one.
            var tokenized = false;
            var name;
            var rule;

            for (var j = 0, len = atRules.length; !tokenized && j < len; ++j) {
              rule = atRules[j];
              name = rule.name || rule;

              if (!isNextString(name)) {
                continue;
              }

              tokenized = true;

              initializeToken(name);
              pushState(rule.state || 'at-group');
              // Leaves the cursor on the last character of the rule name.
              skip(name.length);

              if (rule.prefix) {
                token.prefix = rule.prefix;
              }

              if (rule.type) {
                token.type = rule.type;
              }
            }

            if (!tokenized) {
              // Keep on truckin' America!
              buffer += ch;
            }
            break;
        }
        break;

      // Parentheses are tracked to disambiguate semi-colons, such as within
      // a data URI.
      case '(':
        switch (getState()) {
          case 'value':
            pushState('value-paren');
            break;

          case 'before-value':
            replaceState('value');
            break;
        }

        buffer += ch;
        break;

      case ')':
        switch (getState()) {
          case 'value-paren':
            popState();
            break;

          case 'before-value':
            replaceState('value');
            break;
        }

        buffer += ch;
        break;

      default:
        switch (getState()) {
          case 'before-selector':
            initializeToken('selector');
            pushState('selector');
            break;

          case 'before-name':
            initializeToken('property');
            replaceState('name');
            break;

          case 'before-value':
            replaceState('value');
            break;

          case 'before-at-value':
            replaceState('at-value');
            break;
        }

        buffer += ch;
        break;
    }
  }

  // Tokenize a trailing @-rule whose value runs to the end of the input
  // without a terminating semi-colon or newline.
  if ('at-value' === getState()) {
    token.value = buffer.trim();
    addToken();
    popState();
  }

  if (TIMER) {
    debug('ran in', (Date.now() - start) + 'ms');
  }

  return tokens;
}