TokenizerBase.java

/*
 * Copyright (c) 2015-2020, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.lexer;

import net.morimekta.strings.ConsoleUtil;
import net.morimekta.strings.io.LineBufferedReader;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import static java.util.Objects.requireNonNull;
import static net.morimekta.strings.EscapeUtil.javaEscape;

/**
 * Base tokenizer used around providence. The base tokenizer supports the
 * minimum of what all the tokenizer implementations should require, so that
 * each will mostly expand support, not much reduce it.
 *
 * @param <TT> TokenType type.
 * @param <T>  Token type.
 */
public abstract class TokenizerBase<TT extends TokenType, T extends Token<TT>>
        extends LineBufferedReader
        implements Tokenizer<TT, T> {
    protected final TokenClassifier     classifier;
    protected final TokenFactory<TT, T> factory;
    private final   List<T>             skipped = new ArrayList<>();

    /**
     * Default buffer size, if not specified: 2048 chars / 4 kB.
     */
    public static final int DEFAULT_BUFFER_SIZE = 1 << 11; // 2048 chars --> 4kB

    /**
     * Create a tokenizer instance.
     *
     * @param classifier Token classifier.
     * @param factory    Token factory.
     * @param in         Reader to read.
     * @param bufferSize The size of the read-buffer.
     * @param preLoadAll If the whole content of the reader should be pre-loaded.
     *                   If this is true, then all token content can be considered
     *                   immutable.
     */
    protected TokenizerBase(TokenClassifier classifier,
                            TokenFactory<TT, T> factory,
                            Reader in,
                            int bufferSize,
                            boolean preLoadAll) {
        super(in, bufferSize, preLoadAll);
        this.classifier = classifier;
        this.factory = factory;
        // If the line is longer than 16k, it will not be used in error messages.
    }

    /**
     * Creates a lexer exception from a token with a message. This is an overridable
     * method in order to be able to change the exception class from the get-go in
     * the tokenizer.
     *
     * @param token   The token to cause the exception.
     * @param message The exception message.
     * @param params  If provided will be used with {@link String#format(String, Object...)} to
     *                make a readable exception message.
     * @return The resulting exception.
     */
    public LexerException failure(T token,
                                  String message,
                                  Object... params) {
        if (params.length > 0) {
            message = String.format(message, params);
        }
        return new LexerException(token, message);
    }

    /**
     * Creates an EOF lexer exception from a token with a message. This is an overridable
     * method in order to be able to change the exception class from the get-go in
     * the tokenizer. This is similar to {@link #failure(Token, String, Object...)} but
     * does not provide a token, and assumes the tokenizer is at the end of the stream.
     *
     * @param message The exception message.
     * @param params  If provided will be used with {@link String#format(String, Object...)} to
     *                make a readable exception message.
     * @return The resulting exception.
     */
    protected LexerException eofFailure(String message,
                                        Object... params) {
        if (params.length > 0) {
            message = String.format(message, params);
        }
        return new LexerException(currentLine(), currentLineNo(), currentLinePos(), 1, message);
    }


    // --- CONSUME ---

    /**
     * This method is called when all other start-of-token checks have
     * been met and failed for any non-whitespace char.
     *
     * @return True if the character, or any number of characters was
     * consumed. False if nothing was consumed. This will result
     * in an unknown start of token exception.
     */
    protected boolean maybeConsumeSilent(T token) {
        // Cheap way to consume simple comments.
        return token.type == factory.lineCommentTokenType();
    }

    public boolean hasSkippedTokens() {
        return !skipped.isEmpty();
    }

    public List<T> getSkippedTokens() {
        return List.copyOf(skipped);
    }

    public List<T> clearSkippedTokens() {
        try {
            return List.copyOf(skipped);
        } finally {
            skipped.clear();
        }
    }

    // --- Tokenizer ---
    @Override
    public int currentLineNo() {
        return this.lineNo;
    }

    @Override
    public int currentLinePos() {
        return this.linePos;
    }

    @Override
    public CharSequence currentLine() {
        return getLine();
    }

    @Override
    public T readUntil(CharSequence terminator, TT type, boolean allowEof) throws IOException {
        requireNonNull(terminator, "terminator == null");
        requireNonNull(type, "type == null");

        maybeConsolidateBuffer();
        if (lastChar == 0) {
            readNextChar();
        }

        if (terminator.length() == 1) {
            return readUntil(terminator.charAt(0), type, allowEof);
        }

        int       startOffset  = bufferOffset;
        final int startLineNo  = lineNo;
        final int startLinePos = linePos;
        final int termLen      = terminator.length();
        final int last         = terminator.charAt(terminator.length() - 1);

        int           len          = 0;
        int           total        = 0;
        StringBuilder consolidated = null;
        while (lastChar >= 0) {
            ++len;
            ++total;

            if (total >= termLen && lastChar == last) {
                boolean cont = false;
                for (int i = termLen - 1; i >= 0; --i) {
                    final int bp = bufferOffset + 1 - termLen + i;
                    if (bp < 0) {
                        if (consolidated == null) throw new IllegalStateException("Bad line consolidation");
                        if (consolidated.charAt(consolidated.length() + bp) != terminator.charAt(i)) {
                            cont = true;
                            break;
                        }
                    } else if (buffer[bp] != terminator.charAt(i)) {
                        cont = true;
                        break;
                    }
                }
                if (!cont) {
                    len -= terminator.length();
                    break;
                }
            }
            if (!preLoaded && bufferOffset == (bufferLimit - 1)) {
                if (consolidated == null) {
                    consolidated = new StringBuilder();
                }
                consolidated.append(buffer, startOffset, len);
                startOffset = 0;
                len = 0;
            }

            readNextChar();
        }
        if (lastChar <= 0 && !allowEof) {
            throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
        }
        lastChar = 0;
        if (consolidated != null) {
            if (len > 0) {
                consolidated.append(buffer, startOffset, len);
            } else if (len < 0) {
                consolidated.delete(consolidated.length() + len, consolidated.length());
            }
            return factory.genericToken(consolidated.toString().toCharArray(),
                                        0,
                                        consolidated.length(),
                                        type, startLineNo, startLinePos);
        } else if (len > 0) {
            return factory.genericToken(buffer, startOffset, len, type, startLineNo, startLinePos);
        }
        return null;
    }


    // --- Object ---
    @Override
    public String toString() {
        return getClass().getSimpleName() + "{preLoaded=" + preLoaded + "}";
    }

    private T readUntil(char terminator, TT type, boolean allowEof) throws IOException {
        int startOffset  = bufferOffset;
        int startLineNo  = lineNo;
        int startLinePos = linePos;

        int           len          = 0;
        StringBuilder consolidated = null;
        while (lastChar >= 0 && lastChar != terminator) {
            ++len;
            if (!preLoaded && bufferOffset == (bufferLimit - 1)) {
                if (consolidated == null) {
                    consolidated = new StringBuilder();
                }
                consolidated.append(buffer, startOffset, len);
                startOffset = 0;
                len = 0;
            }
            readNextChar();
        }
        if (lastChar < 0 && !allowEof) {
            throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
        }
        lastChar = 0;
        if (consolidated != null) {
            if (len > 0) {
                consolidated.append(buffer, startOffset, len);
            }
            return factory.genericToken(
                    consolidated.toString().toCharArray(),
                    0,
                    consolidated.length(),
                    type, startLineNo, startLinePos);
        } else if (len > 0) {
            return factory.genericToken(buffer, startOffset, len, type, startLineNo, startLinePos);
        }
        return null;
    }

    @Override
    public T parseNextToken() throws IOException {
        var next = parseNextTokenInternal();
        while (next != null && maybeConsumeSilent(next)) {
            skipped.add(next);
            next = parseNextTokenInternal();
        }

        return next;
    }

    private T parseNextTokenInternal() throws IOException {
        while (lastChar >= 0) {
            if (lastChar == 0) {
                if (!readNextChar()) {
                    break;
                }
            }

            if (classifier.isWhitespace(lastChar)) {
                // ignore
                lastChar = 0;
            } else if (classifier.startIdentifier(lastChar)) {
                return nextIdentifier(lastChar);
            } else if (classifier.startString(lastChar)) {
                return nextString(lastChar);
            } else if (classifier.startNumber(lastChar)) {
                return nextNumber();
            } else if (classifier.startLineComment(lastChar)) {
                return nextLineComment();
            } else if (classifier.startSymbol(lastChar)) {
                return nextSymbol();
            } else {
                // non-allowed characters not starting a valid token.
                T token = factory.symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
                throw failure(token, "Unknown token initiator '%s'", javaEscape((char) lastChar));
            }
        }

        return null;
    }

    // --- INTERNAL ---

    /**
     * @return Read the next identifier from buffer.
     * @throws IOException If unable to read.
     */
    protected T nextIdentifier(int first) throws IOException {
        maybeConsolidateBuffer();

        int startPos    = linePos;
        int startOffset = bufferOffset;
        int startLine   = lineNo;
        int len         = 1;
        int lastLast;
        if (first != lastChar) {
            lastLast = first;
            --len;
        } else {
            lastLast = lastChar;
            if (!readNextChar()) {
                return factory.identifierToken(buffer, startOffset, len, startLine, startPos);
            }
        }
        if (classifier.allowBeforeString(lastLast) &&
            classifier.startString(lastChar)) {
            return nextString(lastLast);
        }

        while (classifier.allowIdentifier(lastChar) || classifier.identifierSeparator(lastChar)) {
            ++len;
            if (classifier.identifierSeparator(lastLast) &&
                !classifier.startSecondaryIdentifier(lastChar, lastLast)) {
                if (classifier.endIdentifier(lastLast, lastChar)) {
                    T token = factory.identifierToken(buffer, startOffset, len - 2, startLine, startPos);
                    // rewind a single read.
                    this.lastChar = lastLast;
                    --this.bufferOffset;
                    --this.linePos;
                    return token;
                } else if (classifier.identifierSeparator(lastChar)) {
                    T token = factory.identifierToken(buffer, startOffset, len, startLine, startPos);
                    throw failure(token, "Identifier with double separator '%c%c'", lastLast, lastChar);
                } else {
                    T token = factory.identifierToken(buffer, startOffset, len, startLine, startPos);
                    throw failure(token, "Identifier part with invalid start '%c'", lastChar);
                }
            }
            lastLast = lastChar;
            if (!readNextChar()) {
                break;
            }
        }

        if (classifier.identifierSeparator(lastLast)) {
            T token = factory.identifierToken(buffer, startOffset, len, startLine, startPos);
            throw failure(token, "Identifier with trailing '%c'", (char) lastLast);
        }

        return factory.identifierToken(buffer, startOffset, len, startLine, startPos);
    }

    protected T nextLineComment() throws IOException {
        return readUntil('\n', factory.lineCommentTokenType(), true);
    }

    protected T nextNumber() throws IOException {
        maybeConsolidateBuffer();
        // NOTE: This code is pretty messy because it is a full state-engine
        // to ensure that the parsed number follows the JSON number syntax.
        // Alternatives are:
        //
        // dec = -?0
        // dec = -?.0
        // dec = -?0.0
        // sci = (dec)[eE][+-]?[0-9]+
        // hex = 0x[0-9a-fA-F]+
        // oct = 0[0-7]+
        //
        // It is programmed as a state-engine to be very efficient, but
        // correctly detect valid JSON (and what is invalid if not).

        int startLine   = lineNo;
        int startPos    = linePos;
        int startOffset = bufferOffset;
        // number (any type).
        int len = 0;

        if (classifier.numberSign(lastChar)) {
            // only base 10 decimals can be negative.
            ++len;
            if (!readNextChar()) {
                T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
                throw failure(token, "Negative indicator without number");
            }

            if (!(lastChar == '.' || (lastChar >= '0' && lastChar <= '9'))) {
                T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
                throw failure(token, "No decimal after negative indicator");
            }
        } else if (classifier.numberEncodingIndicator(lastChar)) {
            if (readNextChar()) {
                ++len;
                if (lastChar == 'x') {
                    ++len;
                    if (!readNextChar()) {
                        T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
                        throw failure(token, "No decimal after hex indicator");
                    }
                    // hexadecimal.
                    do {
                        if (!((lastChar >= '0' && lastChar <= '9') || (lastChar >= 'a' && lastChar <= 'f') ||
                              (lastChar >= 'A' && lastChar <= 'F'))) {
                            // we read a char that's *not* part of the hex number.
                            break;
                        }
                        ++len;
                    } while (readNextChar());

                    return validateAfterNumber(startOffset, startPos, len);
                } else if ('0' <= lastChar && lastChar <= '7') {
                    ++len;
                    // Octals have 0 in front, and then more digits.
                    while (readNextChar()) {
                        if (lastChar < '0' || lastChar > '7') {
                            break;
                        }
                        ++len;
                    }
                    return validateAfterNumber(startOffset, startPos, len);
                }
            } else {
                // just '0'
                return validateAfterNumber(startOffset, startPos, 1);
            }
        }

        // decimal part.
        while (lastChar >= '0' && lastChar <= '9') {
            ++len;
            // numbers are terminated by first non-numeric character.
            if (!readNextChar()) {
                break;
            }
        }
        // fraction part.
        if (classifier.numberDecimalSep(lastChar)) {
            ++len;
            // numbers are terminated by first non-numeric character.
            if (readNextChar()) {
                while (lastChar >= '0' && lastChar <= '9') {
                    ++len;
                    // numbers are terminated by first non-numeric character.
                    if (!readNextChar()) {
                        break;
                    }
                }
            }
        } else if (classifier.allowAfterInteger(lastChar) ||
                   classifier.allowAfterFloatingPoint(lastChar)) {
            ++len;
            // ignore the actual char, but move the 'last' so we can check the char after.
            readNextChar();
            return validateAfterNumber(startOffset, startPos, len);
        }

        // exponent part.
        if (classifier.numberExponentSep(lastChar)) {
            ++len;
            // numbers are terminated by first non-numeric character.
            if (!readNextChar()) {
                T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
                throw failure(token,
                              "Badly terminated number exponent: '%s'", token);
            }

            // The exponent can be explicitly prefixed with both '+' and '-'.
            if (classifier.numberExponentSign(lastChar)) {
                ++len;
                // numbers are terminated by first non-numeric character.
                if (!readNextChar()) {
                    T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
                    throw failure(token, "Badly terminated number exponent: '%s'", token);
                }
            }

            if (lastChar >= '0' && lastChar <= '9') {
                while (lastChar >= '0' && lastChar <= '9') {
                    ++len;
                    // numbers are terminated by first non-numeric character.
                    if (!readNextChar()) {
                        break;
                    }
                }
            } else {
                T token = factory.numberToken(buffer, startOffset, len + 1, startLine, startPos);
                throw failure(token, "Badly terminated number exponent: '%s'", token);
            }
        }

        if (classifier.allowAfterFloatingPoint(lastChar)) {
            ++len;
            // ignore the actual char, but move the 'last' so we can check the char after.
            readNextChar();
        }

        return validateAfterNumber(startOffset, startPos, len);
    }

    private T validateAfterNumber(int startOffset, int startLinePos, int len)
            throws IOException {
        // A number must be terminated correctly. must not immediately start identifier or string.
        if (classifier.startIdentifier(lastChar) ||
            classifier.startString(lastChar) ||
            classifier.startNumber(lastChar)) {
            T token = factory.numberToken(buffer, startOffset, len + 1, lineNo, startLinePos);
            throw failure(token, "Invalid termination of number: '%s'", token);
        } else {
            return factory.numberToken(buffer, startOffset, len, lineNo, startLinePos);
        }
    }

    private T nextString(int quote) throws IOException {
        maybeConsolidateBuffer();

        // strings may be longer than 128 bytes. We may need to build it.
        StringBuilder consolidatedString = null;

        int startLine   = lineNo;
        int startPos    = linePos;
        int startOffset = bufferOffset;
        int endOffset   = 0;

        if (!classifier.startString(quote)) {
            --startPos;
            --startOffset;
            quote = lastChar;
        }

        boolean esc = false;
        for (; ; ) {
            if (!preLoaded && !bufferLineEnd && bufferOffset >= (bufferLimit - 1)) {
                if (consolidatedString == null) {
                    consolidatedString = new StringBuilder();
                }
                consolidatedString.append(buffer, startOffset, bufferOffset - startOffset + 1);
                startOffset = 0;
            }

            if (!readNextChar()) {
                throw eofFailure("Unexpected end of stream in string");
            }

            if (esc) {
                esc = false;
            } else if (lastChar == '\\') {
                esc = true;
            } else if (lastChar == quote) {
                if (readNextChar()) {
                    if (classifier.allowAfterString(lastChar)) {
                        if (readNextChar()) {
                            if (classifier.startIdentifier(lastChar) || classifier.startString(lastChar)) {
                                T token = factory.identifierToken(buffer, bufferOffset, 1, startLine, linePos);
                                throw failure(token, "Unexpected token after string marker: '%s'", token);
                            }
                        }
                    }
                }
                break;
            } else if (lastChar == '\n' || lastChar == '\r') {
                T token = factory.symbolToken(buffer, bufferOffset, 1, startLine, linePos);
                throw failure(token, "Unexpected newline in string");
            } else if (lastChar < 0x20 || lastChar == 0x7f ||
                       (lastChar > 0x7f && !ConsoleUtil.isConsolePrintable(lastChar))) {
                T token = factory.symbolToken(buffer, bufferOffset, 1, startLine, linePos);
                throw failure(token, "Unescaped non-printable char in string: '%s'",
                              javaEscape((char) lastChar));
            }
        }

        if (consolidatedString != null) {
            consolidatedString.append(buffer, 0, bufferOffset + endOffset);
            String result = consolidatedString.toString();
            return factory.stringToken(result.toCharArray(), 0, result.length(), startLine, startPos);
        } else {
            return factory.stringToken(buffer,
                                       startOffset,
                                       (bufferOffset - startOffset) + endOffset,
                                       startLine,
                                       startPos);
        }
    }

    /**
     * This method is protected so that it can be overridden to
     * produce multi-symbol tokens if necessary.
     *
     * @return Symbol token.
     * @throws IOException If unable to read token.
     */
    protected T nextSymbol() throws IOException {
        if (classifier.allowBeforeIdentifier(lastChar)) {
            var first = lastChar;
            if (!readNextChar()) {
                factory.symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
            }
            if (classifier.startIdentifier(lastChar)) {
                --linePos;
                --bufferOffset;
                return nextIdentifier(first);
            }
            return factory.symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
        }
        lastChar = 0;
        return factory.symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
    }
}