// TokenizerBase.java
/*
* Copyright (c) 2015-2020, Stein Eldar Johnsen
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package net.morimekta.lexer;
import net.morimekta.strings.ConsoleUtil;
import net.morimekta.strings.io.LineBufferedReader;
import java.io.IOException;
import java.io.Reader;
import static java.util.Objects.requireNonNull;
import static net.morimekta.strings.EscapeUtil.javaEscape;
/**
* Base tokenizer used around providence. The base tokenizer supports the
* minimum of what all the tokenizer implementations should require, so that
* each will mostly expand support, not much reduce it.
*
* @param <TT> TokenType type.
* @param <T> Token type.
*/
public abstract class TokenizerBase<TT, T extends Token<TT>>
extends LineBufferedReader
implements Tokenizer<TT, T> {
/**
* Default read-buffer size, in chars, for use when no size is specified:
* 2048 chars, which is 4 kB.
*/
public static final int DEFAULT_BUFFER_SIZE = 1 << 11; // 2048 chars --> 4kB
/**
* Create a tokenizer instance. All arguments are passed on unchanged to the
* {@link LineBufferedReader} constructor.
*
* @param in Reader to read.
* @param bufferSize The size of the read-buffer.
* @param preLoadAll If the whole content of the reader should be pre-loaded.
* If this is true, then all token content can be considered
* immutable.
*/
protected TokenizerBase(Reader in,
int bufferSize,
boolean preLoadAll) {
super(in, bufferSize, preLoadAll);
// NOTE(review): the original comment here stated that a line longer than 16k
// will not be used in error messages. Nothing in this constructor enforces
// that; presumably it is handled by the underlying reader - confirm.
}
/**
 * Build the lexer exception to throw for a problem located at a specific token.
 * Kept overridable so a tokenizer implementation can substitute its own
 * exception class for every failure raised by the base tokenizer.
 *
 * @param token   The token the failure is attributed to.
 * @param message The exception message, or a format pattern when params are given.
 * @param params  When non-empty, the message is run through
 *                {@link String#format(String, Object...)} with these arguments.
 *                When empty the message is used verbatim.
 * @return The resulting exception. It is returned, not thrown.
 */
public LexerException failure(T token,
                              String message,
                              Object... params) {
    // Only format when there is something to format with, so plain messages
    // containing '%' are passed through untouched.
    final String text = params.length > 0
                        ? String.format(message, params)
                        : message;
    return new LexerException(token, text);
}
/**
 * Build the lexer exception to throw when the stream ended unexpectedly.
 * Works like {@link #failure(Token, String, Object...)}, except that there is
 * no token to point at: the position is taken from the tokenizer's current
 * line, line number and line position, with a length of 1. Kept overridable
 * so the exception class can be replaced by the tokenizer implementation.
 *
 * @param message The exception message, or a format pattern when params are given.
 * @param params  When non-empty, the message is run through
 *                {@link String#format(String, Object...)} with these arguments.
 *                When empty the message is used verbatim.
 * @return The resulting exception. It is returned, not thrown.
 */
protected LexerException eofFailure(String message,
                                    Object... params) {
    final String text = params.length > 0
                        ? String.format(message, params)
                        : message;
    return new LexerException(currentLine(), currentLineNo(), currentLinePos(), 1, text);
}
/**
* Create an identifier token. On the normal parsing path the token starts with
* a character accepted by {@link #startIdentifier()}. It is also used to wrap
* the offending content when an invalid identifier is reported.
*
* @param buffer The char buffer to be wrapped in the token.
* @param offset The buffer offset for start of token.
* @param len The token length.
* @param lineNo The line number of the token position.
* @param linePos The line position of the token.
* @return The created token.
*/
protected abstract T identifierToken(char[] buffer, int offset, int len, int lineNo, int linePos);
/**
* Create a string token. The token content includes the surrounding quotes:
* it starts with a character accepted by {@link #startString()} and ends with
* the same character.
*
* @param buffer The char buffer to be wrapped in the token.
* @param offset The buffer offset for start of token.
* @param len The token length.
* @param lineNo The line number of the token position.
* @param linePos The line position of the token.
* @return The created token.
*/
protected abstract T stringToken(char[] buffer, int offset, int len, int lineNo, int linePos);
/**
* Create a number token. On the normal parsing path the token starts with a
* character accepted by {@link #startNumber()}. It is also used to wrap the
* offending content when a malformed number is reported.
*
* @param buffer The char buffer to be wrapped in the token.
* @param offset The buffer offset for start of token.
* @param len The token length.
* @param lineNo The line number of the token position.
* @param linePos The line position of the token.
* @return The created token.
*/
protected abstract T numberToken(char[] buffer, int offset, int len, int lineNo, int linePos);
/**
* Create a symbol token. When called from {@link #nextSymbol()} the token is
* a single character accepted by {@link #startSymbol()}. The tokenizer also
* uses this to wrap a single offending character when reporting errors (an
* unknown token initiator, or an illegal character inside a string), in which
* case the character is not necessarily a valid symbol.
*
* @param buffer The char buffer to be wrapped in the token.
* @param offset The buffer offset for start of token.
* @param len The token length.
* @param lineNo The line number of the token position.
* @param linePos The line position of the token.
* @return The created token.
*/
protected abstract T symbolToken(char[] buffer, int offset, int len, int lineNo, int linePos);
/**
* Create a generic token. This is called from
* {@link #readUntil(CharSequence, Object, boolean)}, and can be used if at
* some point a custom token type should be created.
*
* @param buffer The char buffer to be wrapped in the token.
* @param offset The buffer offset for start of token.
* @param len The token length.
* @param type The type of the token.
* @param lineNo The line number of the token position.
* @param linePos The line position of the token.
* @return The created token.
*/
protected abstract T genericToken(char[] buffer, int offset, int len, TT type, int lineNo, int linePos);
// --- Token start char detection.
/**
 * Check whether the last read character is whitespace: space, tab, carriage
 * return or newline. Whitespace is skipped between tokens and carries no
 * meaning of its own.
 *
 * @return If the last char is a whitespace. See {@link #lastChar}.
 */
protected boolean isWhitespace() {
    switch (lastChar) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
            return true;
        default:
            return false;
    }
}
/**
 * Check whether the last read character may begin a number: a decimal point,
 * a minus sign or a decimal digit.
 *
 * @return If the last character is the valid start of a number. See {@link #lastChar}.
 */
protected boolean startNumber() {
    final int c = lastChar;
    if (c == '.' || c == '-') {
        return true;
    }
    return '0' <= c && c <= '9';
}
/**
 * Check whether the last read character opens a string literal, which for the
 * base tokenizer means a double quote.
 *
 * @return If the last character is the valid start of a string. See {@link #lastChar}.
 */
protected boolean startString() {
    return '"' == lastChar;
}
/**
 * Check whether the last read character may begin an identifier: an
 * underscore or an ASCII letter.
 *
 * @return If the last character is the valid start of an identifier. See {@link #lastChar}.
 */
protected boolean startIdentifier() {
    final int c = lastChar;
    if (c == '_') {
        return true;
    }
    final boolean lower = c >= 'a' && c <= 'z';
    final boolean upper = c >= 'A' && c <= 'Z';
    return lower || upper;
}
/**
 * Check whether the last read character may be a symbol: any character in the
 * range 0x20 (inclusive) to 0x7F (exclusive), except '#'.
 *
 * @return If the last character is the valid start of a symbol. See {@link #lastChar}.
 */
protected boolean startSymbol() {
    final int c = lastChar;
    final boolean inRange = 0x20 <= c && c < 0x7F;
    return inRange && c != '#';
}
// --- Internal token allowance helpers.
/**
 * Check whether a character may appear inside an identifier: an underscore,
 * a dot, an ASCII letter or a decimal digit.
 *
 * @param last Character to check.
 * @return If the character is allowed as part of an identifier.
 */
protected boolean allowIdentifier(int last) {
    if (last == '_' || last == '.') {
        return true;
    }
    if (last >= 'a' && last <= 'z') {
        return true;
    }
    if (last >= 'A' && last <= 'Z') {
        return true;
    }
    return last >= '0' && last <= '9';
}
/**
 * Check whether a character separates the parts of an identifier. An
 * identifier is a separator-joined sequence of parts made of allowed
 * identifier characters; a separator may not be directly repeated, and an
 * identifier may not end with one. The base tokenizer uses '.' as separator.
 *
 * @param last Character to check.
 * @return If the character is separating parts of the identifier.
 */
protected boolean identifierSeparator(int last) {
    return '.' == last;
}
/**
* Hook for accepting a single type-marker character directly after an
* integer. The number parser counts an accepted character as part of the
* number token. The base implementation accepts no such character.
*
* @param last Character to check.
* @return If the character is allowed as a type-marker for a integer.
*/
protected boolean allowAfterInteger(int last) {
return false;
}
/**
* Hook for accepting a single type-marker character directly after a
* floating point number. The number parser counts an accepted character as
* part of the number token. The base implementation accepts no such character.
*
* @param last Character to check
* @return If the character is allowed as a type-marker for a floating point
* number. This is also checked after integers.
*/
protected boolean allowAfterFloatingPoint(int last) {
return false;
}
// --- CONSUME ---
/**
 * Last resort when a non-whitespace character matched none of the
 * start-of-token checks: try to consume content that should produce no token.
 * The base implementation consumes '#' comments running to the end of line.
 *
 * @return True if the character, or any number of characters was
 *         consumed. False if nothing was consumed. This will result
 *         in an unknown start of token exception.
 * @throws IOException If unable to consume characters.
 */
protected boolean maybeConsumeSilent() throws IOException {
    if (lastChar != '#') {
        return false;
    }
    // Cheap way to consume simple comments: drop the rest of the line and
    // mark the current char as consumed.
    getRestOfLine();
    lastChar = 0;
    return true;
}
/**
 * Produce a symbol token for the single character at the current buffer
 * position. Protected so an implementation can override it to produce
 * multi-character symbol tokens if necessary.
 *
 * @return Symbol token.
 * @throws IOException If unable to read token.
 */
protected T nextSymbol() throws IOException {
    // Mark the char as consumed before the token is created.
    lastChar = 0;
    final T symbol = symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
    return symbol;
}
// --- Tokenizer ---
/**
 * @return The line number the reader is currently positioned at.
 */
@Override
public int currentLineNo() {
    return lineNo;
}
/**
 * @return The position within the current line the reader is positioned at.
 */
@Override
public int currentLinePos() {
    return linePos;
}
/**
 * @return The current line, as provided by the underlying reader's {@code getLine()}.
 */
@Override
public CharSequence currentLine() {
    final CharSequence line = getLine();
    return line;
}
/**
 * Read content from the current position up to the given terminator and
 * return it as one generic token of the given type. The terminator itself is
 * consumed, but is not part of the returned token. Single-character
 * terminators are delegated to the simpler char based variant.
 *
 * @param terminator The char sequence ending the content. Must not be null or empty.
 * @param type       The token type handed to
 *                   {@link #genericToken(char[], int, int, Object, int, int)}. Must not be null.
 * @param allowEof   If true, reaching the end of the stream before the
 *                   terminator is accepted, and what was read is returned.
 * @return The token with the content before the terminator. May be null when
 *         there is no content before the terminator.
 * @throws IllegalArgumentException If the terminator is empty.
 * @throws LexerException If the end of the stream is reached before the
 *                        terminator and allowEof is false.
 * @throws IOException If unable to read from the stream.
 */
@Override
public T readUntil(CharSequence terminator, TT type, boolean allowEof) throws IOException {
    requireNonNull(terminator, "terminator == null");
    requireNonNull(type, "type == null");
    final int termLen = terminator.length();
    if (termLen == 0) {
        // Reject before touching the reader state. Without this check the
        // code below fails on terminator.charAt(-1) after input may already
        // have been consumed.
        throw new IllegalArgumentException("Empty terminator");
    }
    maybeConsolidateBuffer();
    if (lastChar == 0) {
        readNextChar();
    }
    if (termLen == 1) {
        return readUntil(terminator.charAt(0), type, allowEof);
    }
    int startOffset = bufferOffset;
    final int startLineNo = lineNo;
    final int startLinePos = linePos;
    final int last = terminator.charAt(termLen - 1);
    // 'len' counts chars in the current buffer not yet moved to 'consolidated',
    // 'total' counts all chars read so far.
    int len = 0;
    int total = 0;
    StringBuilder consolidated = null;
    while (lastChar >= 0) {
        ++len;
        ++total;
        // Only do the full comparison when the last terminator char matches
        // and enough chars have been read to hold the whole terminator.
        if (total >= termLen && lastChar == last) {
            boolean cont = false;
            for (int i = termLen - 1; i >= 0; --i) {
                final int bp = bufferOffset + 1 - termLen + i;
                if (bp < 0) {
                    // This part of the candidate lies before the current
                    // buffer, so it must be at the end of 'consolidated'.
                    if (consolidated == null) throw new IllegalStateException("Bad line consolidation");
                    if (consolidated.charAt(consolidated.length() + bp) != terminator.charAt(i)) {
                        cont = true;
                        break;
                    }
                } else if (buffer[bp] != terminator.charAt(i)) {
                    cont = true;
                    break;
                }
            }
            if (!cont) {
                // Terminator found: exclude it from the content. This may make
                // 'len' negative when the terminator started in 'consolidated'.
                len -= termLen;
                break;
            }
        }
        if (!preLoaded && bufferOffset == (bufferLimit - 1)) {
            // At the last char of the buffer: save what has been read so far
            // before the next read replaces the buffer content.
            if (consolidated == null) {
                consolidated = new StringBuilder();
            }
            consolidated.append(buffer, startOffset, len);
            startOffset = 0;
            len = 0;
        }
        readNextChar();
    }
    if (lastChar <= 0 && !allowEof) {
        throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
    }
    // Mark the last char (end of terminator) as consumed.
    lastChar = 0;
    if (consolidated != null) {
        if (len > 0) {
            consolidated.append(buffer, startOffset, len);
        } else if (len < 0) {
            // Remove the part of the terminator that was already saved.
            consolidated.delete(consolidated.length() + len, consolidated.length());
        }
        return genericToken(consolidated.toString().toCharArray(),
                            0,
                            consolidated.length(),
                            type, startLineNo, startLinePos);
    } else if (len > 0) {
        return genericToken(buffer, startOffset, len, type, startLineNo, startLinePos);
    }
    return null;
}
// --- Object ---
/**
 * @return The simple class name followed by the {@code preLoaded} flag,
 *         e.g. {@code Name{preLoaded=true}}.
 */
@Override
public String toString() {
    final StringBuilder out = new StringBuilder(getClass().getSimpleName());
    out.append("{preLoaded=").append(preLoaded).append("}");
    return out.toString();
}
/**
 * Single-character variant of the read-until logic. Reads from the current
 * char up to the terminator char, which is consumed but not included in the
 * returned token.
 *
 * @param terminator The char ending the content.
 * @param type       The token type handed to the generic token factory.
 * @param allowEof   If true, end of stream is accepted in place of the terminator.
 * @return The token with the content before the terminator, or null if
 *         nothing was read and no buffer content had been saved.
 * @throws IOException If unable to read, or if the stream ended before the
 *                     terminator and allowEof is false.
 */
private T readUntil(char terminator, TT type, boolean allowEof) throws IOException {
    final int firstLineNo = lineNo;
    final int firstLinePos = linePos;
    int from = bufferOffset;
    // Number of chars in the current buffer not yet moved to 'spilled'.
    int pending = 0;
    StringBuilder spilled = null;

    for (; lastChar >= 0 && lastChar != terminator; readNextChar()) {
        ++pending;
        if (preLoaded || bufferOffset != (bufferLimit - 1)) {
            continue;
        }
        // At the last char of the buffer: save what has been read so far
        // before the next read replaces the buffer content.
        if (spilled == null) {
            spilled = new StringBuilder();
        }
        spilled.append(buffer, from, pending);
        from = 0;
        pending = 0;
    }

    if (!allowEof && lastChar < 0) {
        throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
    }
    // Mark the terminator as consumed.
    lastChar = 0;

    if (spilled == null) {
        return pending > 0
               ? genericToken(buffer, from, pending, type, firstLineNo, firstLinePos)
               : null;
    }
    if (pending > 0) {
        spilled.append(buffer, from, pending);
    }
    return genericToken(spilled.toString().toCharArray(),
                        0,
                        spilled.length(),
                        type, firstLineNo, firstLinePos);
}
/**
 * Parse the next token from the stream. Whitespace and silently consumable
 * content (see {@link #maybeConsumeSilent()}) are skipped. The start checks
 * are tried in order: identifier, string, number, symbol.
 *
 * @return The next token, or null when the end of the stream is reached.
 * @throws IOException If unable to read, or if a character that cannot start
 *                     any token is encountered.
 */
@Override
public T parseNextToken() throws IOException {
    while (lastChar >= 0) {
        // A zero last char means the previous char was consumed.
        if (lastChar == 0 && !readNextChar()) {
            break;
        }
        if (isWhitespace()) {
            // ignore
            lastChar = 0;
            continue;
        }
        if (startIdentifier()) {
            return nextIdentifier();
        }
        if (startString()) {
            return nextString(lastChar);
        }
        if (startNumber()) {
            return nextNumber();
        }
        if (startSymbol()) {
            return nextSymbol();
        }
        if (maybeConsumeSilent()) {
            continue;
        }
        // non-allowed characters not starting a valid token.
        final T offending = symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
        throw failure(offending, "Unknown token initiator '%s'", javaEscape((char) lastChar));
    }
    return null;
}
// --- INTERNAL ---
/**
 * Read an identifier starting at the current char. The identifier continues
 * for as long as {@link #allowIdentifier(int)} accepts the next char. Parts
 * are joined by chars accepted by {@link #identifierSeparator(int)}; a
 * separator may not be repeated, must be followed by a char accepted by
 * {@link #startIdentifier()}, and may not end the identifier.
 *
 * @return The identifier token.
 * @throws IOException If unable to read, or if the identifier is malformed.
 */
private T nextIdentifier() throws IOException {
    maybeConsolidateBuffer();
    int startPos = linePos;
    int startOffset = bufferOffset;
    int startLine = lineNo;
    int len = 1;
    // The previously accepted char, needed for the separator rules.
    int lastLast = lastChar;
    if (!readNextChar()) {
        // Single char identifier at the end of the stream.
        return identifierToken(buffer, startOffset, len, startLine, startPos);
    }
    while (allowIdentifier(lastChar)) {
        ++len;
        if (lastChar == lastLast && identifierSeparator(lastLast)) {
            T token = identifierToken(buffer, startOffset, len, startLine, startPos);
            // Report the actual separator char, since identifierSeparator()
            // may be overridden. For the default '.' the message is unchanged.
            throw failure(token, "Identifier with double '%c'", (char) lastLast);
        } else if (identifierSeparator(lastLast) && !startIdentifier()) {
            T token = identifierToken(buffer, startOffset, len, startLine, startPos);
            throw failure(token, "Identifier part with invalid start '%c'", (char) lastChar);
        }
        lastLast = lastChar;
        if (!readNextChar()) {
            break;
        }
    }
    if (identifierSeparator(lastLast)) {
        T token = identifierToken(buffer, startOffset, len, startLine, startPos);
        throw failure(token, "Identifier with trailing '%c'", (char) lastLast);
    }
    return identifierToken(buffer, startOffset, len, startLine, startPos);
}
/**
 * Read a number starting at the current char. Handles decimal numbers with
 * optional leading '-', fraction and exponent, plus hexadecimal ("0x...") and
 * octal ("0...") integers. A single type-marker char accepted by
 * {@link #allowAfterInteger(int)} or {@link #allowAfterFloatingPoint(int)}
 * is included in the token.
 *
 * @return The number token.
 * @throws IOException If unable to read, or if the number is malformed or is
 *                     directly followed by the start of an identifier,
 *                     string or number.
 */
private T nextNumber() throws IOException {
    maybeConsolidateBuffer();
    // NOTE: This code is pretty messy because it is a full state-engine
    // to ensure that the parsed number follows the JSON number syntax.
    // Alternatives are:
    //
    // dec = -?0
    // dec = -?.0
    // dec = -?0.0
    // sci = (dec)[eE][+-]?[0-9]+
    // hex = 0x[0-9a-fA-F]+
    // oct = 0[0-7]+
    //
    // It is programmed as a state-engine to be very efficient, but
    // correctly detect valid JSON (and what is invalid if not).
    int startLine = lineNo;
    int startPos = linePos;
    int startOffset = bufferOffset;
    // number (any type).
    int len = 0;
    if (lastChar == '-') {
        // only base 10 decimals can be negative.
        ++len;
        if (!readNextChar()) {
            T token = numberToken(buffer, startOffset, len, startLine, startPos);
            throw failure(token, "Negative indicator without number");
        }
        if (!(lastChar == '.' || (lastChar >= '0' && lastChar <= '9'))) {
            T token = numberToken(buffer, startOffset, len, startLine, startPos);
            throw failure(token, "No decimal after negative indicator");
        }
    } else if (lastChar == '0') {
        if (readNextChar()) {
            // Count the leading '0' now that the char after it is known.
            ++len;
            if (lastChar == 'x') {
                ++len;
                if (!readNextChar()) {
                    T token = numberToken(buffer, startOffset, len, startLine, startPos);
                    throw failure(token, "No decimal after hex indicator");
                }
                // Length of the "0x" prefix, to detect a missing digit part.
                final int prefixLen = len;
                // hexadecimal.
                do {
                    if (!((lastChar >= '0' && lastChar <= '9') || (lastChar >= 'a' && lastChar <= 'f') ||
                          (lastChar >= 'A' && lastChar <= 'F'))) {
                        // we read a char that's *not* part of the hex number.
                        break;
                    }
                    ++len;
                } while (readNextChar());
                // Check termination first, so a bad char directly after "0x"
                // keeps reporting the invalid termination as before.
                T token = validateAfterNumber(startOffset, startPos, len);
                if (len == prefixLen) {
                    // "0x" with no hex digits is not a number. Same message as
                    // when the stream ends right after the hex indicator.
                    throw failure(token, "No decimal after hex indicator");
                }
                return token;
            } else if ('0' <= lastChar && lastChar <= '7') {
                ++len;
                // Octals have 0 in front, and then more digits.
                while (readNextChar()) {
                    if (lastChar < '0' || lastChar > '7') {
                        break;
                    }
                    ++len;
                }
                return validateAfterNumber(startOffset, startPos, len);
            }
        } else {
            // just '0'
            return validateAfterNumber(startOffset, startPos, 1);
        }
    }
    // decimal part.
    while (lastChar >= '0' && lastChar <= '9') {
        ++len;
        // numbers are terminated by first non-numeric character.
        if (!readNextChar()) {
            break;
        }
    }
    // fraction part.
    if (lastChar == '.') {
        ++len;
        // numbers are terminated by first non-numeric character.
        if (readNextChar()) {
            while (lastChar >= '0' && lastChar <= '9') {
                ++len;
                // numbers are terminated by first non-numeric character.
                if (!readNextChar()) {
                    break;
                }
            }
        }
    } else if (allowAfterInteger(lastChar) ||
               allowAfterFloatingPoint(lastChar)) {
        ++len;
        // ignore the actual char, but move the 'last' so we can check the char after.
        readNextChar();
        return validateAfterNumber(startOffset, startPos, len);
    }
    // exponent part.
    if (lastChar == 'e' || lastChar == 'E') {
        ++len;
        // numbers are terminated by first non-numeric character.
        if (!readNextChar()) {
            T token = numberToken(buffer, startOffset, len, startLine, startPos);
            throw failure(token,
                          "Badly terminated number exponent: '%s'", token);
        }
        // The exponent can be explicitly prefixed with both '+'
        // and '-'.
        if (lastChar == '-' || lastChar == '+') {
            ++len;
            // numbers are terminated by first non-numeric character.
            if (!readNextChar()) {
                T token = numberToken(buffer, startOffset, len, startLine, startPos);
                throw failure(token, "Badly terminated number exponent: '%s'", token);
            }
        }
        if (lastChar >= '0' && lastChar <= '9') {
            while (lastChar >= '0' && lastChar <= '9') {
                ++len;
                // numbers are terminated by first non-numeric character.
                if (!readNextChar()) {
                    break;
                }
            }
        } else {
            // Include the offending char in the reported token.
            T token = numberToken(buffer, startOffset, len + 1, startLine, startPos);
            throw failure(token, "Badly terminated number exponent: '%s'", token);
        }
    }
    if (allowAfterFloatingPoint(lastChar)) {
        ++len;
        // ignore the actual char, but move the 'last' so we can check the char after.
        readNextChar();
    }
    return validateAfterNumber(startOffset, startPos, len);
}
/**
 * Finish a number by checking the char directly after it. A number must not
 * be immediately followed by something that starts an identifier, a string
 * or another number.
 *
 * @param startOffset  The buffer offset where the number starts.
 * @param startLinePos The line position where the number starts.
 * @param len          The length of the number.
 * @return The number token, created with the reader's current line number.
 * @throws IOException If the number is badly terminated. The reported token
 *                     then includes the offending char.
 */
private T validateAfterNumber(int startOffset, int startLinePos, int len)
        throws IOException {
    final boolean runsIntoNextToken =
            lastChar > 0 && (startIdentifier() || startString() || startNumber());
    if (!runsIntoNextToken) {
        return numberToken(buffer, startOffset, len, lineNo, startLinePos);
    }
    final T offending = numberToken(buffer, startOffset, len + 1, lineNo, startLinePos);
    throw failure(offending, "Invalid termination of number: '%s'", offending);
}
/**
 * Read a quoted string starting at the current char, which is the opening
 * quote. The returned token includes both quotes. A backslash escapes the
 * following char, so an escaped quote does not end the string. Unescaped
 * newlines and non-printable chars are rejected.
 *
 * @param quote The quote char that started the string, and must end it.
 * @return The string token.
 * @throws IOException If unable to read, if the stream ends inside the
 *                     string, or if the string contains an illegal char.
 */
private T nextString(int quote) throws IOException {
    maybeConsolidateBuffer();

    final int firstLine = lineNo;
    final int firstPos = linePos;
    int from = bufferOffset;
    // Holds the leading part of a string that does not fit in one buffer.
    StringBuilder spilled = null;
    boolean escaped = false;

    while (true) {
        if (!preLoaded && !bufferLineEnd && bufferOffset >= (bufferLimit - 1)) {
            // At the end of the buffer: save what has been read so far
            // before the next read replaces the buffer content.
            if (spilled == null) {
                spilled = new StringBuilder();
            }
            spilled.append(buffer, from, bufferOffset - from + 1);
            from = 0;
        }
        if (!readNextChar()) {
            throw eofFailure("Unexpected end of stream in string");
        }
        if (escaped) {
            // The char after a backslash is accepted as is.
            escaped = false;
            continue;
        }
        if (lastChar == '\\') {
            escaped = true;
            continue;
        }
        if (lastChar == quote) {
            break;
        }
        if (lastChar == '\n' || lastChar == '\r') {
            final T offending = symbolToken(buffer, bufferOffset, 1, firstLine, linePos);
            throw failure(offending, "Unexpected newline in string");
        }
        if (lastChar < 0x20 || lastChar == 0x7f ||
            (lastChar > 0x7f && !ConsoleUtil.isConsolePrintable(lastChar))) {
            final T offending = symbolToken(buffer, bufferOffset, 1, firstLine, linePos);
            throw failure(offending, "Unescaped non-printable char in string: '%s'",
                          javaEscape((char) lastChar));
        }
    }

    // Mark the closing quote as consumed.
    lastChar = 0;
    if (spilled == null) {
        return stringToken(buffer, from, bufferOffset - from + 1, firstLine, firstPos);
    }
    spilled.append(buffer, 0, bufferOffset + 1);
    final String content = spilled.toString();
    return stringToken(content.toCharArray(), 0, content.length(), firstLine, firstPos);
}
}