TokenizerBase.java
- /*
- * Copyright (c) 2015-2020, Stein Eldar Johnsen
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
- package net.morimekta.lexer;
- import net.morimekta.strings.ConsoleUtil;
- import net.morimekta.strings.io.LineBufferedReader;
- import java.io.IOException;
- import java.io.Reader;
- import static java.util.Objects.requireNonNull;
- import static net.morimekta.strings.EscapeUtil.javaEscape;
- /**
- * Base tokenizer used around providence. The base tokenizer supports the
- * minimum of what all the tokenizer implementations should require, so that
- * each will mostly expand support, not much reduce it.
- *
- * @param <TT> TokenType type.
- * @param <T> Token type.
- */
- public abstract class TokenizerBase<TT, T extends Token<TT>>
- extends LineBufferedReader
- implements Tokenizer<TT, T> {
- /** Default buffer size, if not specified: 2048 chars / 4 kB. */
- public static final int DEFAULT_BUFFER_SIZE = 1 << 11; // 2048 chars --> 4kB
/**
 * Set up a tokenizer on top of a reader. All arguments are handed straight
 * to the {@link LineBufferedReader} super-constructor.
 *
 * @param in         The reader to take the content from.
 * @param bufferSize The size of the read-buffer.
 * @param preLoadAll Whether the whole content of the reader should be
 *                   loaded up front. When true, all token content can be
 *                   considered immutable.
 */
protected TokenizerBase(Reader in, int bufferSize, boolean preLoadAll) {
    // NOTE(review): an earlier comment here said that lines longer than 16k
    // are not used in error messages. Nothing in this constructor enforces
    // that; confirm against LineBufferedReader before relying on it.
    super(in, bufferSize, preLoadAll);
}
- /**
- * Creates a lexer exception from a token with a message. This is an overridable
- * method in order to be able to change the exception class from the get-go in
- * the tokenizer.
- *
- * @param token The token to cause the exception.
- * @param message The exception message.
- * @param params If provided will be used with {@link String#format(String, Object...)} to
- * make a readable exception message.
- * @return The resulting exception.
- */
- public LexerException failure(T token,
- String message,
- Object... params) {
- if (params.length > 0) {
- message = String.format(message, params);
- }
- return new LexerException(token, message);
- }
- /**
- * Creates an EOF lexer exception from a token with a message. This is an overridable
- * method in order to be able to change the exception class from the get-go in
- * the tokenizer. This is similar to {@link #failure(Token, String, Object...)} but
- * does not provide a token, and assumes the tokenizer is at the end of the stream.
- *
- * @param message The exception message.
- * @param params If provided will be used with {@link String#format(String, Object...)} to
- * make a readable exception message.
- * @return The resulting exception.
- */
- protected LexerException eofFailure(String message,
- Object... params) {
- if (params.length > 0) {
- message = String.format(message, params);
- }
- return new LexerException(currentLine(), currentLineNo(), currentLinePos(), 1, message);
- }
- /**
- * Create an identifier token. This token will always start with a character accepted
- * by {@link #startIdentifier()}.
- *
- * @param buffer The char buffer to be wrapped in the token.
- * @param offset The buffer offset for start of token.
- * @param len The token length.
- * @param lineNo The line number of the token position.
- * @param linePos The line position of the token.
- * @return The created token.
- */
- protected abstract T identifierToken(char[] buffer, int offset, int len, int lineNo, int linePos);
- /**
- * Create a string token. This token will always start and end with a character accepted
- * by {@link #startString()}.
- *
- * @param buffer The char buffer to be wrapped in the token.
- * @param offset The buffer offset for start of token.
- * @param len The token length.
- * @param lineNo The line number of the token position.
- * @param linePos The line position of the token.
- * @return The created token.
- */
- protected abstract T stringToken(char[] buffer, int offset, int len, int lineNo, int linePos);
- /**
- * Create a number token. This token will always start with a character accepted
- * by {@link #startNumber()}.
- *
- * @param buffer The char buffer to be wrapped in the token.
- * @param offset The buffer offset for start of token.
- * @param len The token length.
- * @param lineNo The line number of the token position.
- * @param linePos The line position of the token.
- * @return The created token.
- */
- protected abstract T numberToken(char[] buffer, int offset, int len, int lineNo, int linePos);
- /**
- * Create a symbol token. This token will always start with a character accepted
- * by {@link #startSymbol()}.
- *
- * @param buffer The char buffer to be wrapped in the token.
- * @param offset The buffer offset for start of token.
- * @param len The token length.
- * @param lineNo The line number of the token position.
- * @param linePos The line position of the token.
- * @return The created token.
- */
- protected abstract T symbolToken(char[] buffer, int offset, int len, int lineNo, int linePos);
- /**
- * Create a generic token. This is called from {@link #readUntil(char, Object, boolean)}, and
- * can be used if at some point a custom token type should be created.
- *
- * @param buffer The char buffer to be wrapped in the token.
- * @param offset The buffer offset for start of token.
- * @param len The token length.
- * @param type The type of the token.
- * @param lineNo The line number of the token position.
- * @param linePos The line position of the token.
- * @return The created token.
- */
- protected abstract T genericToken(char[] buffer, int offset, int len, TT type, int lineNo, int linePos);
- // --- Token start char detection.
- /**
- * Whitespace characters are generally ignored, and assumed to have no meaning.
- *
- * @return If the last char is a whitespace. See {@link #lastChar}.
- */
- protected boolean isWhitespace() {
- return lastChar == '\n' ||
- lastChar == '\r' ||
- lastChar == '\t' ||
- lastChar == ' ';
- }
- /**
- * @return If the last character is the valid start of a number. See {@link #lastChar}.
- */
- protected boolean startNumber() {
- return lastChar == '.' ||
- lastChar == '-' ||
- (lastChar >= '0' && lastChar <= '9');
- }
- /**
- * @return If the last character is the valid start of a string. See {@link #lastChar}.
- */
- protected boolean startString() {
- return lastChar == '\"';
- }
- /**
- * @return If the last character is the valid start of an identifier. See {@link #lastChar}.
- */
- protected boolean startIdentifier() {
- return '_' == lastChar ||
- ('a' <= lastChar && lastChar <= 'z') ||
- ('A' <= lastChar && lastChar <= 'Z');
- }
- /**
- * @return If the last character is the valid start of a symbol. See {@link #lastChar}.
- */
- protected boolean startSymbol() {
- return lastChar != '#' &&
- lastChar >= 0x20 &&
- lastChar < 0x7F;
- }
- // --- Internal token allowance helpers.
- /**
- * @param last Character to check.
- * @return If the character is allowed as part of an identifier.
- */
- protected boolean allowIdentifier(int last) {
- return '_' == last || '.' == last ||
- ('a' <= last && last <= 'z') ||
- ('A' <= last && last <= 'Z') ||
- ('0' <= last && last <= '9');
- }
- /**
- * Rules for identifiers is to be a separator joined string of
- * allowed identifier chars. The separator cannot be directly repeated,
- * and the identifier cannot end with a separator.
- *
- * @param last Character to check.
- * @return If the character is separating parts of the identifier.
- */
- protected boolean identifierSeparator(int last) {
- return last == '.';
- }
- /**
- * @param last Character to check.
- * @return If the character is allowed as a type-marker for a integer.
- */
- protected boolean allowAfterInteger(int last) {
- return false;
- }
- /**
- * @param last Character to check
- * @return If the character is allowed as a type-marker for a floating point
- * number. This is also checked after integers.
- */
- protected boolean allowAfterFloatingPoint(int last) {
- return false;
- }
- // --- CONSUME ---
- /**
- * This method is called when all other start-of-token checks have
- * been met and failed for any non-whitespace char.
- *
- * @return True if the character, or any number of characters was
- * consumed. False if nothing was consumed. This will result
- * in an unknown start of token exception.
- * @throws IOException If unable to consume characters.
- */
- protected boolean maybeConsumeSilent() throws IOException {
- // Cheap way to consume simple comments.
- if (lastChar == '#') {
- getRestOfLine();
- lastChar = 0;
- return true;
- }
- return false;
- }
- /**
- * This method is protected so that it can be overridden to
- * produce multi-symbol tokens if necessary.
- *
- * @return Symbol token.
- * @throws IOException If unable to read token.
- */
- protected T nextSymbol() throws IOException {
- lastChar = 0;
- return symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
- }
- // --- Tokenizer ---
- @Override
- public int currentLineNo() {
- return this.lineNo;
- }
- @Override
- public int currentLinePos() {
- return this.linePos;
- }
- @Override
- public CharSequence currentLine() {
- return getLine();
- }
- @Override
- public T readUntil(CharSequence terminator, TT type, boolean allowEof) throws IOException {
- requireNonNull(terminator, "terminator == null");
- requireNonNull(type, "type == null");
- maybeConsolidateBuffer();
- if (lastChar == 0) {
- readNextChar();
- }
- if (terminator.length() == 1) {
- return readUntil(terminator.charAt(0), type, allowEof);
- }
- int startOffset = bufferOffset;
- final int startLineNo = lineNo;
- final int startLinePos = linePos;
- final int termLen = terminator.length();
- final int last = terminator.charAt(terminator.length() - 1);
- int len = 0;
- int total = 0;
- StringBuilder consolidated = null;
- while (lastChar >= 0) {
- ++len;
- ++total;
- if (total >= termLen && lastChar == last) {
- boolean cont = false;
- for (int i = termLen - 1; i >= 0; --i) {
- final int bp = bufferOffset + 1 - termLen + i;
- if (bp < 0) {
- if (consolidated == null) throw new IllegalStateException("Bad line consolidation");
- if (consolidated.charAt(consolidated.length() + bp) != terminator.charAt(i)) {
- cont = true;
- break;
- }
- } else if (buffer[bp] != terminator.charAt(i)) {
- cont = true;
- break;
- }
- }
- if (!cont) {
- len -= terminator.length();
- break;
- }
- }
- if (!preLoaded && bufferOffset == (bufferLimit - 1)) {
- if (consolidated == null) {
- consolidated = new StringBuilder();
- }
- consolidated.append(buffer, startOffset, len);
- startOffset = 0;
- len = 0;
- }
- readNextChar();
- }
- if (lastChar <= 0 && !allowEof) {
- throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
- }
- lastChar = 0;
- if (consolidated != null) {
- if (len > 0) {
- consolidated.append(buffer, startOffset, len);
- } else if (len < 0) {
- consolidated.delete(consolidated.length() + len, consolidated.length());
- }
- return genericToken(consolidated.toString().toCharArray(),
- 0,
- consolidated.length(),
- type, startLineNo, startLinePos);
- } else if (len > 0) {
- return genericToken(buffer, startOffset, len, type, startLineNo, startLinePos);
- }
- return null;
- }
- // --- Object ---
- @Override
- public String toString() {
- return getClass().getSimpleName() + "{preLoaded=" + preLoaded + "}";
- }
- private T readUntil(char terminator, TT type, boolean allowEof) throws IOException {
- int startOffset = bufferOffset;
- int startLineNo = lineNo;
- int startLinePos = linePos;
- int len = 0;
- StringBuilder consolidated = null;
- while (lastChar >= 0 && lastChar != terminator) {
- ++len;
- if (!preLoaded && bufferOffset == (bufferLimit - 1)) {
- if (consolidated == null) {
- consolidated = new StringBuilder();
- }
- consolidated.append(buffer, startOffset, len);
- startOffset = 0;
- len = 0;
- }
- readNextChar();
- }
- if (lastChar < 0 && !allowEof) {
- throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
- }
- lastChar = 0;
- if (consolidated != null) {
- if (len > 0) {
- consolidated.append(buffer, startOffset, len);
- }
- return genericToken(consolidated.toString().toCharArray(),
- 0,
- consolidated.length(),
- type, startLineNo, startLinePos);
- } else if (len > 0) {
- return genericToken(buffer, startOffset, len, type, startLineNo, startLinePos);
- }
- return null;
- }
- @Override
- public T parseNextToken() throws IOException {
- while (lastChar >= 0) {
- if (lastChar == 0) {
- if (!readNextChar()) {
- break;
- }
- }
- if (isWhitespace()) {
- // ignore
- lastChar = 0;
- } else if (startIdentifier()) {
- return nextIdentifier();
- } else if (startString()) {
- return nextString(lastChar);
- } else if (startNumber()) {
- return nextNumber();
- } else if (startSymbol()) {
- return nextSymbol();
- } else if (!maybeConsumeSilent()) {
- // non-allowed characters not starting a valid token.
- T token = symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
- throw failure(token, "Unknown token initiator '%s'", javaEscape((char) lastChar));
- }
- }
- return null;
- }
- // --- INTERNAL ---
- private T nextIdentifier() throws IOException {
- maybeConsolidateBuffer();
- int startPos = linePos;
- int startOffset = bufferOffset;
- int startLine = lineNo;
- int len = 1;
- int lastLast = lastChar;
- if (!readNextChar()) {
- return identifierToken(buffer, startOffset, len, startLine, startPos);
- }
- while (allowIdentifier(lastChar)) {
- ++len;
- if (lastChar == lastLast && identifierSeparator(lastLast)) {
- T token = identifierToken(buffer, startOffset, len, startLine, startPos);
- throw failure(token, "Identifier with double '.'");
- } else if (identifierSeparator(lastLast) && !startIdentifier()) {
- T token = identifierToken(buffer, startOffset, len, startLine, startPos);
- throw failure(token, "Identifier part with invalid start '%c'", lastChar);
- }
- lastLast = lastChar;
- if (!readNextChar()) {
- break;
- }
- }
- if (identifierSeparator(lastLast)) {
- T token = identifierToken(buffer, startOffset, len, startLine, startPos);
- throw failure(token, "Identifier with trailing '%c'", (char) lastLast);
- }
- return identifierToken(buffer, startOffset, len, startLine, startPos);
- }
- private T nextNumber() throws IOException {
- maybeConsolidateBuffer();
- // NOTE: This code is pretty messy because it is a full state-engine
- // to ensure that the parsed number follows the JSON number syntax.
- // Alternatives are:
- //
- // dec = -?0
- // dec = -?.0
- // dec = -?0.0
- // sci = (dec)[eE][+-]?[0-9]+
- // hex = 0x[0-9a-fA-F]+
- // oct = 0[0-7]+
- //
- // It is programmed as a state-engine to be very efficient, but
- // correctly detect valid JSON (and what is invalid if not).
- int startLine = lineNo;
- int startPos = linePos;
- int startOffset = bufferOffset;
- // number (any type).
- int len = 0;
- if (lastChar == '-') {
- // only base 10 decimals can be negative.
- ++len;
- if (!readNextChar()) {
- T token = numberToken(buffer, startOffset, len, startLine, startPos);
- throw failure(token, "Negative indicator without number");
- }
- if (!(lastChar == '.' || (lastChar >= '0' && lastChar <= '9'))) {
- T token = numberToken(buffer, startOffset, len, startLine, startPos);
- throw failure(token, "No decimal after negative indicator");
- }
- } else if (lastChar == '0') {
- if (readNextChar()) {
- ++len;
- if (lastChar == 'x') {
- ++len;
- if (!readNextChar()) {
- T token = numberToken(buffer, startOffset, len, startLine, startPos);
- throw failure(token, "No decimal after hex indicator");
- }
- // hexadecimal.
- do {
- if (!((lastChar >= '0' && lastChar <= '9') || (lastChar >= 'a' && lastChar <= 'f') ||
- (lastChar >= 'A' && lastChar <= 'F'))) {
- // we read a char that's *not* part of the hex number.
- break;
- }
- ++len;
- } while (readNextChar());
- return validateAfterNumber(startOffset, startPos, len);
- } else if ('0' <= lastChar && lastChar <= '7') {
- ++len;
- // Octals have 0 in front, and then more digits.
- while (readNextChar()) {
- if (lastChar < '0' || lastChar > '7') {
- break;
- }
- ++len;
- }
- return validateAfterNumber(startOffset, startPos, len);
- }
- } else {
- // just '0'
- return validateAfterNumber(startOffset, startPos, 1);
- }
- }
- // decimal part.
- while (lastChar >= '0' && lastChar <= '9') {
- ++len;
- // numbers are terminated by first non-numeric character.
- if (!readNextChar()) {
- break;
- }
- }
- // fraction part.
- if (lastChar == '.') {
- ++len;
- // numbers are terminated by first non-numeric character.
- if (readNextChar()) {
- while (lastChar >= '0' && lastChar <= '9') {
- ++len;
- // numbers are terminated by first non-numeric character.
- if (!readNextChar()) {
- break;
- }
- }
- }
- } else if (allowAfterInteger(lastChar) ||
- allowAfterFloatingPoint(lastChar)) {
- ++len;
- // ignore the actual char, but move the 'last' so we can check the char after.
- readNextChar();
- return validateAfterNumber(startOffset, startPos, len);
- }
- // exponent part.
- if (lastChar == 'e' || lastChar == 'E') {
- ++len;
- // numbers are terminated by first non-numeric character.
- if (!readNextChar()) {
- T token = numberToken(buffer, startOffset, len, startLine, startPos);
- throw failure(token,
- "Badly terminated number exponent: '%s'", token);
- }
- // The exponent can be explicitly prefixed with both '+'
- // and '-'.
- if (lastChar == '-' || lastChar == '+') {
- ++len;
- // numbers are terminated by first non-numeric character.
- if (!readNextChar()) {
- T token = numberToken(buffer, startOffset, len, startLine, startPos);
- throw failure(token, "Badly terminated number exponent: '%s'", token);
- }
- }
- if (lastChar >= '0' && lastChar <= '9') {
- while (lastChar >= '0' && lastChar <= '9') {
- ++len;
- // numbers are terminated by first non-numeric character.
- if (!readNextChar()) {
- break;
- }
- }
- } else {
- T token = numberToken(buffer, startOffset, len + 1, startLine, startPos);
- throw failure(token, "Badly terminated number exponent: '%s'", token);
- }
- }
- if (allowAfterFloatingPoint(lastChar)) {
- ++len;
- // ignore the actual char, but move the 'last' so we can check the char after.
- readNextChar();
- }
- return validateAfterNumber(startOffset, startPos, len);
- }
- private T validateAfterNumber(int startOffset, int startLinePos, int len)
- throws IOException {
- // A number must be terminated correctly. must not immediately start identifier or string.
- if (lastChar > 0 && (startIdentifier() || startString() || startNumber())) {
- T token = numberToken(buffer, startOffset, len + 1, lineNo, startLinePos);
- throw failure(token, "Invalid termination of number: '%s'", token);
- } else {
- return numberToken(buffer, startOffset, len, lineNo, startLinePos);
- }
- }
- private T nextString(int quote) throws IOException {
- maybeConsolidateBuffer();
- // strings may be longer than 128 bytes. We may need to build it.
- StringBuilder consolidatedString = null;
- int startLine = lineNo;
- int startPos = linePos;
- int startOffset = bufferOffset;
- boolean esc = false;
- for (; ; ) {
- if (!preLoaded && !bufferLineEnd && bufferOffset >= (bufferLimit - 1)) {
- if (consolidatedString == null) {
- consolidatedString = new StringBuilder();
- }
- consolidatedString.append(buffer, startOffset, bufferOffset - startOffset + 1);
- startOffset = 0;
- }
- if (!readNextChar()) {
- throw eofFailure("Unexpected end of stream in string");
- }
- if (esc) {
- esc = false;
- } else if (lastChar == '\\') {
- esc = true;
- } else if (lastChar == quote) {
- break;
- } else if (lastChar == '\n' || lastChar == '\r') {
- T token = symbolToken(buffer, bufferOffset, 1, startLine, linePos);
- throw failure(token, "Unexpected newline in string");
- } else if (lastChar < 0x20 || lastChar == 0x7f ||
- (lastChar > 0x7f && !ConsoleUtil.isConsolePrintable(lastChar))) {
- T token = symbolToken(buffer, bufferOffset, 1, startLine, linePos);
- throw failure(token, "Unescaped non-printable char in string: '%s'",
- javaEscape((char) lastChar));
- }
- }
- lastChar = 0;
- if (consolidatedString != null) {
- consolidatedString.append(buffer, 0, bufferOffset + 1);
- String result = consolidatedString.toString();
- return stringToken(result.toCharArray(), 0, result.length(), startLine, startPos);
- } else {
- return stringToken(buffer, startOffset, bufferOffset - startOffset + 1, startLine, startPos);
- }
- }
- }