TokenizerBase.java

/*
 * Copyright (c) 2015-2020, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.lexer;

import net.morimekta.strings.ConsoleUtil;
import net.morimekta.strings.io.LineBufferedReader;

import java.io.IOException;
import java.io.Reader;

import static java.util.Objects.requireNonNull;
import static net.morimekta.strings.EscapeUtil.javaEscape;

/**
 * Base tokenizer used around providence. The base tokenizer supports the
 * minimum of what all the tokenizer implementations should require, so that
 * each will mostly expand support, not much reduce it.
 *
 * @param <TT> TokenType type.
 * @param <T> Token type.
 */
public abstract class TokenizerBase<TT, T extends Token<TT>>
        extends LineBufferedReader
        implements Tokenizer<TT, T> {
    /** Default buffer size, if not specified: 2048 chars / 4 kB. */
    public static final int DEFAULT_BUFFER_SIZE = 1 << 11; // 2048 chars --> 4kB

    /**
     * Create a tokenizer instance. All arguments are handed on unchanged to the
     * {@link LineBufferedReader} constructor.
     *
     * @param in Reader to read.
     * @param bufferSize The size of the read-buffer. See {@link #DEFAULT_BUFFER_SIZE}
     *                   for the default used when no size is specified.
     * @param preLoadAll If the whole content of the reader should be pre-loaded.
     *                   If this is true, then all token content can be considered
     *                   immutable.
     */
    protected TokenizerBase(Reader in,
                            int bufferSize,
                            boolean preLoadAll) {
        super(in, bufferSize, preLoadAll);
        // NOTE(review): this comment used to state that a line longer than 16k
        // will not be used in error messages. Nothing in this class enforces
        // such a limit, so confirm against LineBufferedReader before relying on it.
    }

    /**
     * Build the lexer exception reported for a specific token. Subclasses may
     * override this to return a more specific exception type for every failure
     * raised by the tokenizer.
     *
     * <p>The message is only run through {@link String#format(String, Object...)}
     * when at least one parameter is given, so a plain message containing a
     * literal {@code %} is passed through untouched.
     *
     * @param token The token that caused the failure.
     * @param message The exception message, or format string if params are given.
     * @param params Optional format arguments for the message.
     * @return The resulting exception.
     */
    public LexerException failure(T token,
                                  String message,
                                  Object... params) {
        final String text = params.length > 0
                            ? String.format(message, params)
                            : message;
        return new LexerException(token, text);
    }

    /**
     * Build the lexer exception reported when the stream ended unexpectedly.
     * Works like {@link #failure(Token, String, Object...)}, but instead of a token
     * the exception is located using the reader's current line, line number and
     * line position. Subclasses may override this to change the exception type.
     *
     * <p>The message is only formatted with {@link String#format(String, Object...)}
     * when at least one parameter is given.
     *
     * @param message The exception message, or format string if params are given.
     * @param params Optional format arguments for the message.
     * @return The resulting exception.
     */
    protected LexerException eofFailure(String message,
                                        Object... params) {
        final String text = params.length > 0
                            ? String.format(message, params)
                            : message;
        return new LexerException(currentLine(), currentLineNo(), currentLinePos(), 1, text);
    }

    /**
     * Create an identifier token. This token will always start with a character accepted
     * by {@link #startIdentifier()}.
     *
     * <p>Besides producing the token for a successfully read identifier, this is
     * also called to build the token attached to the exception when an identifier
     * turns out to be malformed (repeated, misplaced or trailing separator).
     *
     * @param buffer The char buffer to be wrapped in the token.
     * @param offset The buffer offset for start of token.
     * @param len The token length.
     * @param lineNo The line number of the token position.
     * @param linePos The line position of the token.
     * @return The created token.
     */
    protected abstract T identifierToken(char[] buffer, int offset, int len, int lineNo, int linePos);

    /**
     * Create a string token. This token will always start and end with a character accepted
     * by {@link #startString()}, i.e. the quotes are part of the token content.
     *
     * <p>The buffer is either the tokenizer's own read buffer, or a separate char
     * array holding just the string when the string had to be assembled from
     * more than one buffer segment.
     *
     * @param buffer The char buffer to be wrapped in the token.
     * @param offset The buffer offset for start of token.
     * @param len The token length.
     * @param lineNo The line number of the token position.
     * @param linePos The line position of the token.
     * @return The created token.
     */
    protected abstract T stringToken(char[] buffer, int offset, int len, int lineNo, int linePos);

    /**
     * Create a number token. This token will always start with a character accepted
     * by {@link #startNumber()}.
     *
     * <p>Besides producing the token for a successfully read number, this is also
     * called to build the token attached to the exception when a number is
     * malformed; in some of those cases the length includes the offending
     * character following the number.
     *
     * @param buffer The char buffer to be wrapped in the token.
     * @param offset The buffer offset for start of token.
     * @param len The token length.
     * @param lineNo The line number of the token position.
     * @param linePos The line position of the token.
     * @return The created token.
     */
    protected abstract T numberToken(char[] buffer, int offset, int len, int lineNo, int linePos);

    /**
     * Create a symbol token. This token will always start with a character accepted
     * by {@link #startSymbol()} when created through {@link #nextSymbol()}.
     *
     * <p>This is also called with a length of 1 to wrap a single offending
     * character for error reporting (unknown token initiator, or an illegal
     * character inside a string); in those cases the character is not
     * necessarily a valid symbol.
     *
     * @param buffer The char buffer to be wrapped in the token.
     * @param offset The buffer offset for start of token.
     * @param len The token length.
     * @param lineNo The line number of the token position.
     * @param linePos The line position of the token.
     * @return The created token.
     */
    protected abstract T symbolToken(char[] buffer, int offset, int len, int lineNo, int linePos);

    /**
     * Create a generic token. This is called from {@link #readUntil(CharSequence, Object, boolean)},
     * and can be used if at some point a custom token type should be created.
     *
     * <p>The buffer is either the tokenizer's own read buffer, or a separate char
     * array holding just the token content when the content had to be assembled
     * from more than one buffer segment.
     *
     * @param buffer The char buffer to be wrapped in the token.
     * @param offset The buffer offset for start of token.
     * @param len The token length.
     * @param type The type of the token.
     * @param lineNo The line number of the token position.
     * @param linePos The line position of the token.
     * @return The created token.
     */
    protected abstract T genericToken(char[] buffer, int offset, int len, TT type, int lineNo, int linePos);

    // --- Token start char detection.

    /**
     * Whitespace separates tokens and carries no meaning of its own. The chars
     * treated as whitespace are newline, carriage return, tab and space.
     *
     * @return If the last char is a whitespace. See {@link #lastChar}.
     */
    protected boolean isWhitespace() {
        switch (lastChar) {
            case '\n':
            case '\r':
            case '\t':
            case ' ':
                return true;
            default:
                return false;
        }
    }


    /**
     * A number may start with a decimal point, a minus sign or an ASCII digit.
     *
     * @return If the last character is the valid start of a number. See {@link #lastChar}.
     */
    protected boolean startNumber() {
        if (lastChar == '.' || lastChar == '-') {
            return true;
        }
        return '0' <= lastChar && lastChar <= '9';
    }

    /**
     * A string starts with a double quote.
     *
     * @return If the last character is the valid start of a string. See {@link #lastChar}.
     */
    protected boolean startString() {
        return '"' == lastChar;
    }

    /**
     * An identifier may start with an underscore or an ASCII letter.
     *
     * @return If the last character is the valid start of an identifier. See {@link #lastChar}.
     */
    protected boolean startIdentifier() {
        if (lastChar == '_') {
            return true;
        }
        final boolean lower = lastChar >= 'a' && lastChar <= 'z';
        final boolean upper = lastChar >= 'A' && lastChar <= 'Z';
        return lower || upper;
    }

    /**
     * Any printable ASCII character (0x20 up to, but not including, 0x7F) may
     * start a symbol, except {@code '#'}.
     *
     * @return If the last character is the valid start of a symbol. See {@link #lastChar}.
     */
    protected boolean startSymbol() {
        final boolean printableAscii = 0x20 <= lastChar && lastChar < 0x7F;
        return printableAscii && lastChar != '#';
    }

    // --- Internal token allowance helpers.

    /**
     * Chars allowed inside an identifier are underscore, dot, ASCII letters and
     * ASCII digits.
     *
     * @param last Character to check.
     * @return If the character is allowed as part of an identifier.
     */
    protected boolean allowIdentifier(int last) {
        if (last == '_' || last == '.') {
            return true;
        }
        if (last >= 'a' && last <= 'z') {
            return true;
        }
        if (last >= 'A' && last <= 'Z') {
            return true;
        }
        return last >= '0' && last <= '9';
    }

    /**
     * An identifier is a sequence of parts joined by a separator. The
     * separator must not be directly repeated, and the identifier must not
     * end with a separator. By default the separator is {@code '.'}.
     *
     * @param last Character to check.
     * @return If the character is separating parts of the identifier.
     */
    protected boolean identifierSeparator(int last) {
        return '.' == last;
    }

    /**
     * Hook for accepting a single type-marker character directly after an
     * integer. The default implementation accepts nothing; override to allow
     * markers.
     *
     * @param last Character to check.
     * @return If the character is allowed as a type-marker for an integer.
     */
    protected boolean allowAfterInteger(int last) {
        return false;
    }

    /**
     * Hook for accepting a single type-marker character directly after a
     * floating point number. The default implementation accepts nothing;
     * override to allow markers.
     *
     * @param last Character to check.
     * @return If the character is allowed as a type-marker for a floating point
     *         number. This is also checked after integers.
     */
    protected boolean allowAfterFloatingPoint(int last) {
        return false;
    }

    // --- CONSUME ---

    /**
     * Called for a non-whitespace char once every start-of-token check has
     * rejected it. The default implementation silently consumes simple
     * comments: on a {@code '#'} the rest of the line is read and discarded.
     *
     * @return True if the character, or any number of characters was
     *         consumed. False if nothing was consumed. This will result
     *         in an unknown start of token exception.
     * @throws IOException If unable to consume characters.
     */
    protected boolean maybeConsumeSilent() throws IOException {
        if (lastChar != '#') {
            // Not a comment, nothing to consume here.
            return false;
        }
        // Cheap way to consume simple comments: drop everything up to the line end.
        getRestOfLine();
        lastChar = 0;
        return true;
    }

    /**
     * Produce a single-character symbol token for the current char and mark
     * that char as consumed. The method is protected so that subclasses can
     * override it to produce multi-symbol tokens if necessary.
     *
     * @return Symbol token.
     * @throws IOException If unable to read token.
     */
    protected T nextSymbol() throws IOException {
        // Mark the char as consumed before handing the position to the token factory.
        lastChar = 0;
        final T symbol = symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
        return symbol;
    }

    // --- Tokenizer ---

    /**
     * @return The line number the reader is currently at.
     */
    @Override
    public int currentLineNo() {
        return lineNo;
    }

    /**
     * @return The position within the line the reader is currently at.
     */
    @Override
    public int currentLinePos() {
        return linePos;
    }

    /**
     * @return The line currently being read, as provided by the underlying reader.
     */
    @Override
    public CharSequence currentLine() {
        final CharSequence line = getLine();
        return line;
    }

    /**
     * Read everything from the current position up to the given terminator
     * and wrap it in a generic token of the given type. The terminator itself
     * is consumed, but is not part of the returned token.
     *
     * @param terminator The char sequence ending the content. Must not be empty.
     * @param type The token type to pass to
     *             {@link #genericToken(char[], int, int, Object, int, int)}.
     * @param allowEof If true, reaching the end of the stream also terminates
     *                 the content. If false, that is a failure.
     * @return The token with the content before the terminator, or null if
     *         there was no content.
     * @throws NullPointerException If terminator or type is null.
     * @throws IllegalArgumentException If the terminator is empty.
     * @throws IOException If unable to read, or if the end of the stream was
     *                     reached without allowEof.
     */
    @Override
    public T readUntil(CharSequence terminator, TT type, boolean allowEof) throws IOException {
        requireNonNull(terminator, "terminator == null");
        requireNonNull(type, "type == null");
        final int termLen = terminator.length();
        if (termLen == 0) {
            // Reject before touching the reader state; an empty terminator
            // would otherwise fail with an index error further down.
            throw new IllegalArgumentException("Empty terminator");
        }

        maybeConsolidateBuffer();
        if (lastChar == 0) {
            // The previous char was consumed, so move to the first content char.
            readNextChar();
        }

        if (termLen == 1) {
            return readUntil(terminator.charAt(0), type, allowEof);
        }

        int startOffset = bufferOffset;
        final int startLineNo = lineNo;
        final int startLinePos = linePos;
        final int last = terminator.charAt(termLen - 1);

        // 'len' counts the chars of the current buffer segment (from startOffset),
        // 'total' counts all chars read, including those moved to 'consolidated'.
        int len = 0;
        int total = 0;
        StringBuilder consolidated = null;
        while (lastChar >= 0) {
            ++len;
            ++total;

            // Only compare against the terminator when enough chars are read
            // and the current char matches the last terminator char.
            if (total >= termLen && lastChar == last) {
                boolean cont = false;
                for (int i = termLen - 1; i >= 0; --i) {
                    final int bp = bufferOffset + 1 - termLen + i;
                    if (bp < 0) {
                        // This part of the terminator candidate is before the start
                        // of the buffer, so it must be at the end of 'consolidated'.
                        if (consolidated == null) throw new IllegalStateException("Bad line consolidation");
                        if (consolidated.charAt(consolidated.length() + bp) != terminator.charAt(i)) {
                            cont = true;
                            break;
                        }
                    } else if (buffer[bp] != terminator.charAt(i)) {
                        cont = true;
                        break;
                    }
                }
                if (!cont) {
                    // Terminator found; it is not part of the content. This may
                    // make 'len' negative if the terminator started in 'consolidated'.
                    len -= termLen;
                    break;
                }
            }
            if (!preLoaded && bufferOffset == (bufferLimit - 1)) {
                // At the last char of the buffer: save the content so far, and
                // continue counting from the start of the buffer.
                if (consolidated == null) {
                    consolidated = new StringBuilder();
                }
                consolidated.append(buffer, startOffset, len);
                startOffset = 0;
                len = 0;
            }

            readNextChar();
        }
        if (lastChar <= 0 && !allowEof) {
            throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
        }
        lastChar = 0;
        if (consolidated != null) {
            if (len > 0) {
                consolidated.append(buffer, startOffset, len);
            } else if (len < 0) {
                // Remove the part of the terminator that was already saved.
                consolidated.delete(consolidated.length() + len, consolidated.length());
            }
            return genericToken(consolidated.toString().toCharArray(),
                                0,
                                consolidated.length(),
                                type, startLineNo, startLinePos);
        } else if (len > 0) {
            return genericToken(buffer, startOffset, len, type, startLineNo, startLinePos);
        }
        return null;
    }

    // --- Object ---

    /**
     * @return The simple class name followed by the pre-loaded state, e.g.
     *         {@code MyTokenizer{preLoaded=true}}.
     */
    @Override
    public String toString() {
        final StringBuilder out = new StringBuilder(getClass().getSimpleName());
        out.append("{preLoaded=").append(preLoaded).append("}");
        return out.toString();
    }

    /**
     * Read everything from the current char up to a single terminating char
     * and wrap it in a generic token. The terminator is consumed, but is not
     * part of the returned token.
     *
     * @param terminator The char ending the content.
     * @param type The token type to pass to the token factory.
     * @param allowEof If the end of the stream is accepted as termination.
     * @return The token with the content before the terminator, or null if
     *         there was no content.
     * @throws IOException If unable to read, or if the end of the stream was
     *                     reached without allowEof.
     */
    private T readUntil(char terminator, TT type, boolean allowEof) throws IOException {
        final int tokenLineNo = lineNo;
        final int tokenLinePos = linePos;

        // Start and length of the content within the current buffer segment.
        int segmentStart = bufferOffset;
        int segmentLen = 0;
        // Content saved from earlier buffer segments, if any.
        StringBuilder spilled = null;

        for (; lastChar >= 0 && lastChar != terminator; readNextChar()) {
            ++segmentLen;
            final boolean atBufferEnd = !preLoaded && bufferOffset == (bufferLimit - 1);
            if (atBufferEnd) {
                // At the last char of the buffer: save the content so far, and
                // continue counting from the start of the buffer.
                if (spilled == null) {
                    spilled = new StringBuilder();
                }
                spilled.append(buffer, segmentStart, segmentLen);
                segmentStart = 0;
                segmentLen = 0;
            }
        }

        if (lastChar < 0 && !allowEof) {
            throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
        }
        // Mark the terminator as consumed.
        lastChar = 0;

        if (spilled == null) {
            return segmentLen > 0
                   ? genericToken(buffer, segmentStart, segmentLen, type, tokenLineNo, tokenLinePos)
                   : null;
        }
        if (segmentLen > 0) {
            spilled.append(buffer, segmentStart, segmentLen);
        }
        return genericToken(spilled.toString().toCharArray(),
                            0,
                            spilled.length(),
                            type, tokenLineNo, tokenLinePos);
    }

    /**
     * Read the next token from the stream. Whitespace and anything consumed by
     * {@link #maybeConsumeSilent()} is skipped. The start-of-token checks are
     * tried in the order identifier, string, number, symbol.
     *
     * @return The next token, or null if the end of the stream was reached.
     * @throws IOException If unable to read, or if a char does not start any
     *                     valid token and was not silently consumed.
     */
    @Override
    public T parseNextToken() throws IOException {
        while (lastChar >= 0) {
            // A 0 means the previous char was consumed, so fetch the next one.
            if (lastChar == 0 && !readNextChar()) {
                break;
            }

            if (isWhitespace()) {
                // ignore
                lastChar = 0;
                continue;
            }
            if (startIdentifier()) {
                return nextIdentifier();
            }
            if (startString()) {
                return nextString(lastChar);
            }
            if (startNumber()) {
                return nextNumber();
            }
            if (startSymbol()) {
                return nextSymbol();
            }
            if (maybeConsumeSilent()) {
                continue;
            }

            // non-allowed characters not starting a valid token.
            T token = symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
            throw failure(token, "Unknown token initiator '%s'", javaEscape((char) lastChar));
        }

        return null;
    }

    // --- INTERNAL ---

    /**
     * Read an identifier token starting at the current char, which the caller
     * has already accepted with {@link #startIdentifier()}. Chars are consumed
     * as long as {@link #allowIdentifier(int)} accepts them.
     *
     * <p>The identifier is rejected if a separator (see
     * {@link #identifierSeparator(int)}) is directly repeated, if the char
     * after a separator is not a valid identifier start, or if the identifier
     * ends with a separator.
     *
     * @return The identifier token.
     * @throws IOException If unable to read, or if the identifier is malformed.
     */
    private T nextIdentifier() throws IOException {
        maybeConsolidateBuffer();

        int startPos = linePos;
        int startOffset = bufferOffset;
        int startLine = lineNo;
        int len = 1;

        // The previously accepted char, used to validate separator placement.
        int lastLast = lastChar;
        if (!readNextChar()) {
            // Single char identifier at the very end of the stream.
            return identifierToken(buffer, startOffset, len, startLine, startPos);
        }

        while (allowIdentifier(lastChar)) {
            ++len;
            if (lastChar == lastLast && identifierSeparator(lastLast)) {
                T token = identifierToken(buffer, startOffset, len, startLine, startPos);
                // Report the actual separator char, as it may be overridden.
                throw failure(token, "Identifier with double '%c'", (char) lastLast);
            } else if (identifierSeparator(lastLast) && !startIdentifier()) {
                T token = identifierToken(buffer, startOffset, len, startLine, startPos);
                throw failure(token, "Identifier part with invalid start '%c'", lastChar);
            }
            lastLast = lastChar;
            if (!readNextChar()) {
                break;
            }
        }

        if (identifierSeparator(lastLast)) {
            T token = identifierToken(buffer, startOffset, len, startLine, startPos);
            throw failure(token, "Identifier with trailing '%c'", (char) lastLast);
        }

        return identifierToken(buffer, startOffset, len, startLine, startPos);
    }

    /**
     * Read a number token starting at the current char, which the caller has
     * already accepted with {@link #startNumber()}.
     *
     * <p>Accepted forms are an optionally negative decimal number with optional
     * fraction and exponent, a hexadecimal number prefixed with {@code 0x}, and
     * an octal number prefixed with {@code 0}. A single type-marker char
     * accepted by {@link #allowAfterInteger(int)} or
     * {@link #allowAfterFloatingPoint(int)} may follow the number. The char
     * after the number is checked by {@code validateAfterNumber}.
     *
     * @return The number token.
     * @throws IOException If unable to read, or if the number is malformed.
     */
    private T nextNumber() throws IOException {
        maybeConsolidateBuffer();
        // NOTE: This code is pretty messy because it is a full state-engine
        // to ensure that the parsed number follows the JSON number syntax,
        // with hexadecimal and octal notation in addition.
        // Alternatives are:
        //
        // dec = -?0
        // dec = -?.0
        // dec = -?0.0
        // sci = (dec)[eE][+-]?[0-9]+
        // hex = 0x[0-9a-fA-F]+
        // oct = 0[0-7]+
        //
        // It is programmed as a state-engine to be very efficient, but
        // correctly detect valid JSON (and what is invalid if not).

        // startLine is only used for the tokens attached to failures raised
        // in this method; validateAfterNumber uses the current line number.
        int startLine   = lineNo;
        int startPos    = linePos;
        int startOffset = bufferOffset;
        // number (any type).
        // 'len' counts the chars accepted as part of the number so far. The
        // current char is counted when it is accepted, not when it is read.
        int len = 0;

        if (lastChar == '-') {
            // only base 10 decimals can be negative.
            ++len;
            if (!readNextChar()) {
                T token = numberToken(buffer, startOffset, len, startLine, startPos);
                throw failure(token, "Negative indicator without number");
            }

            if (!(lastChar == '.' || (lastChar >= '0' && lastChar <= '9'))) {
                T token = numberToken(buffer, startOffset, len, startLine, startPos);
                throw failure(token, "No decimal after negative indicator");
            }
        } else if (lastChar == '0') {
            // A leading '0' may start a hex or octal number. If neither applies
            // this falls through to the decimal handling with the '0' counted.
            if (readNextChar()) {
                // Counts the leading '0'.
                ++len;
                if (lastChar == 'x') {
                    ++len;
                    if (!readNextChar()) {
                        T token = numberToken(buffer, startOffset, len, startLine, startPos);
                        throw failure(token, "No decimal after hex indicator");
                    }
                    // hexadecimal.
                    // NOTE(review): if the char after "0x" is not a hex digit the loop
                    // ends at once, and "0x" is returned as a number without digits
                    // unless validateAfterNumber rejects the following char.
                    do {
                        if (!((lastChar >= '0' && lastChar <= '9') || (lastChar >= 'a' && lastChar <= 'f') ||
                              (lastChar >= 'A' && lastChar <= 'F'))) {
                            // we read a char that's *not* part of the hex number.
                            break;
                        }
                        ++len;
                    } while (readNextChar());

                    return validateAfterNumber(startOffset, startPos, len);
                } else if ('0' <= lastChar && lastChar <= '7') {
                    ++len;
                    // Octals have 0 in front, and then more digits.
                    while (readNextChar()) {
                        if (lastChar < '0' || lastChar > '7') {
                            break;
                        }
                        ++len;
                    }
                    return validateAfterNumber(startOffset, startPos, len);
                }
            } else {
                // just '0'
                // The '0' was not counted in 'len' yet, so pass the length directly.
                return validateAfterNumber(startOffset, startPos, 1);
            }
        }

        // decimal part.
        while (lastChar >= '0' && lastChar <= '9') {
            ++len;
            // numbers are terminated by first non-numeric character.
            if (!readNextChar()) {
                break;
            }
        }
        // fraction part.
        if (lastChar == '.') {
            ++len;
            // numbers are terminated by first non-numeric character.
            if (readNextChar()) {
                while (lastChar >= '0' && lastChar <= '9') {
                    ++len;
                    // numbers are terminated by first non-numeric character.
                    if (!readNextChar()) {
                        break;
                    }
                }
            }
        } else if (allowAfterInteger(lastChar) ||
                   allowAfterFloatingPoint(lastChar)) {
            // No fraction, and the char is a type-marker: the number ends with it.
            ++len;
            // ignore the actual char, but move the 'last' so we can check the char after.
            readNextChar();
            return validateAfterNumber(startOffset, startPos, len);
        }

        // exponent part.
        if (lastChar == 'e' || lastChar == 'E') {
            ++len;
            // numbers are terminated by first non-numeric character.
            if (!readNextChar()) {
                T token = numberToken(buffer, startOffset, len, startLine, startPos);
                throw failure(token,
                              "Badly terminated number exponent: '%s'", token);
            }

            // The exponent can be explicitly prefixed with both '+'
            // and '-'.
            if (lastChar == '-' || lastChar == '+') {
                ++len;
                // numbers are terminated by first non-numeric character.
                if (!readNextChar()) {
                    T token = numberToken(buffer, startOffset, len, startLine, startPos);
                    throw failure(token, "Badly terminated number exponent: '%s'", token);
                }
            }

            // The exponent requires at least one digit.
            if (lastChar >= '0' && lastChar <= '9') {
                while (lastChar >= '0' && lastChar <= '9') {
                    ++len;
                    // numbers are terminated by first non-numeric character.
                    if (!readNextChar()) {
                        break;
                    }
                }
            } else {
                // 'len + 1' includes the offending char in the reported token.
                T token = numberToken(buffer, startOffset, len + 1, startLine, startPos);
                throw failure(token, "Badly terminated number exponent: '%s'", token);
            }
        }

        if (allowAfterFloatingPoint(lastChar)) {
            ++len;
            // ignore the actual char, but move the 'last' so we can check the char after.
            readNextChar();
        }

        return validateAfterNumber(startOffset, startPos, len);
    }

    /**
     * Check that a number is properly terminated, and create its token. The
     * char directly after a number must not be the start of an identifier, a
     * string or another number. The token is created with the current line
     * number.
     *
     * @param startOffset The buffer offset of the first char of the number.
     * @param startLinePos The line position of the first char of the number.
     * @param len The length of the number.
     * @return The number token.
     * @throws IOException If the number is followed by an invalid char.
     */
    private T validateAfterNumber(int startOffset, int startLinePos, int len)
            throws IOException {
        final boolean badTermination =
                lastChar > 0 && (startIdentifier() || startString() || startNumber());
        if (!badTermination) {
            return numberToken(buffer, startOffset, len, lineNo, startLinePos);
        }
        // Include the offending char in the token reported with the failure.
        T token = numberToken(buffer, startOffset, len + 1, lineNo, startLinePos);
        throw failure(token, "Invalid termination of number: '%s'", token);
    }

    /**
     * Read a quoted string token starting at the current char, which is the
     * opening quote. The returned token includes both the opening and the
     * closing quote. Escape sequences are not decoded here; a backslash only
     * makes the char following it exempt from the checks below.
     *
     * <p>The string must be closed on the same line, and must not contain
     * unescaped control chars or chars that are not console printable.
     *
     * @param quote The quote char that must end the string.
     * @return The string token.
     * @throws IOException If unable to read, if the stream ends before the
     *                     closing quote, or if the string contains an illegal char.
     */
    private T nextString(int quote) throws IOException {
        maybeConsolidateBuffer();

        // The string may continue past the end of the current buffer content. In
        // that case the parts read so far are collected here.
        StringBuilder consolidatedString = null;

        int startLine = lineNo;
        int startPos = linePos;
        int startOffset = bufferOffset;

        // True when the previous char was an unescaped backslash.
        boolean esc = false;
        for (; ; ) {
            if (!preLoaded && !bufferLineEnd && bufferOffset >= (bufferLimit - 1)) {
                // At the end of the buffer without the line being complete: save
                // everything up to and including the current char before reading on.
                // NOTE(review): presumably the next read refills the buffer from
                // offset 0 - confirm against LineBufferedReader.
                if (consolidatedString == null) {
                    consolidatedString = new StringBuilder();
                }
                consolidatedString.append(buffer, startOffset, bufferOffset - startOffset + 1);
                startOffset = 0;
            }

            if (!readNextChar()) {
                throw eofFailure("Unexpected end of stream in string");
            }

            if (esc) {
                // Escaped char, accepted as is.
                esc = false;
            } else if (lastChar == '\\') {
                esc = true;
            } else if (lastChar == quote) {
                // Closing quote.
                break;
            } else if (lastChar == '\n' || lastChar == '\r') {
                T token = symbolToken(buffer, bufferOffset, 1, startLine, linePos);
                throw failure(token, "Unexpected newline in string");
            } else if (lastChar < 0x20 || lastChar == 0x7f ||
                       (lastChar > 0x7f && !ConsoleUtil.isConsolePrintable(lastChar))) {
                T token = symbolToken(buffer, bufferOffset, 1, startLine, linePos);
                throw failure(token, "Unescaped non-printable char in string: '%s'",
                              javaEscape((char) lastChar));
            }
        }

        // Mark the closing quote as consumed.
        lastChar = 0;
        if (consolidatedString != null) {
            // Add the remainder, from the start of the buffer through the closing quote.
            consolidatedString.append(buffer, 0, bufferOffset + 1);
            String result = consolidatedString.toString();
            return stringToken(result.toCharArray(), 0, result.length(), startLine, startPos);
        } else {
            return stringToken(buffer, startOffset, bufferOffset - startOffset + 1, startLine, startPos);
        }
    }
}