TokenizerBase.java
/*
* Copyright (c) 2015-2020, Stein Eldar Johnsen
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package net.morimekta.lexer;
import net.morimekta.strings.ConsoleUtil;
import net.morimekta.strings.io.LineBufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import static java.util.Objects.requireNonNull;
import static net.morimekta.strings.EscapeUtil.javaEscape;
/**
* Base tokenizer used around providence. The base tokenizer supports the
* minimum of what all the tokenizer implementations should require, so that
* each will mostly expand support, not much reduce it.
*
* @param <TT> TokenType type.
* @param <T> Token type.
*/
public abstract class TokenizerBase<TT extends TokenType, T extends Token<TT>>
extends LineBufferedReader
implements Tokenizer<TT, T> {
protected final TokenClassifier classifier;
protected final TokenFactory<TT, T> factory;
private final List<T> skipped = new ArrayList<>();
/**
* Default buffer size, if not specified: 2048 chars / 4 kB.
*/
public static final int DEFAULT_BUFFER_SIZE = 1 << 11; // 2048 chars --> 4kB
/**
* Create a tokenizer instance.
*
* @param classifier Token classifier.
* @param factory Token factory.
* @param in Reader to read.
* @param bufferSize The size of the read-buffer.
* @param preLoadAll If the whole content of the reader should be pre-loaded.
* If this is true, then all token content can be considered
* immutable.
*/
protected TokenizerBase(TokenClassifier classifier,
TokenFactory<TT, T> factory,
Reader in,
int bufferSize,
boolean preLoadAll) {
super(in, bufferSize, preLoadAll);
this.classifier = classifier;
this.factory = factory;
// If the line is longer than 16k, it will not be used in error messages.
}
/**
* Creates a lexer exception from a token with a message. This is an overridable
* method in order to be able to change the exception class from the get-go in
* the tokenizer.
*
* @param token The token to cause the exception.
* @param message The exception message.
* @param params If provided will be used with {@link String#format(String, Object...)} to
* make a readable exception message.
* @return The resulting exception.
*/
public LexerException failure(T token,
String message,
Object... params) {
if (params.length > 0) {
message = String.format(message, params);
}
return new LexerException(token, message);
}
/**
* Creates an EOF lexer exception from a token with a message. This is an overridable
* method in order to be able to change the exception class from the get-go in
* the tokenizer. This is similar to {@link #failure(Token, String, Object...)} but
* does not provide a token, and assumes the tokenizer is at the end of the stream.
*
* @param message The exception message.
* @param params If provided will be used with {@link String#format(String, Object...)} to
* make a readable exception message.
* @return The resulting exception.
*/
protected LexerException eofFailure(String message,
Object... params) {
if (params.length > 0) {
message = String.format(message, params);
}
return new LexerException(currentLine(), currentLineNo(), currentLinePos(), 1, message);
}
// --- CONSUME ---
/**
* This method is called when all other start-of-token checks have
* been met and failed for any non-whitespace char.
*
* @return True if the character, or any number of characters was
* consumed. False if nothing was consumed. This will result
* in an unknown start of token exception.
*/
protected boolean maybeConsumeSilent(T token) {
// Cheap way to consume simple comments.
return token.type == factory.lineCommentTokenType();
}
public boolean hasSkippedTokens() {
return !skipped.isEmpty();
}
public List<T> getSkippedTokens() {
return List.copyOf(skipped);
}
public List<T> clearSkippedTokens() {
try {
return List.copyOf(skipped);
} finally {
skipped.clear();
}
}
// --- Tokenizer ---
@Override
public int currentLineNo() {
return this.lineNo;
}
@Override
public int currentLinePos() {
return this.linePos;
}
@Override
public CharSequence currentLine() {
return getLine();
}
@Override
public T readUntil(CharSequence terminator, TT type, boolean allowEof) throws IOException {
requireNonNull(terminator, "terminator == null");
requireNonNull(type, "type == null");
maybeConsolidateBuffer();
if (lastChar == 0) {
readNextChar();
}
if (terminator.length() == 1) {
return readUntil(terminator.charAt(0), type, allowEof);
}
int startOffset = bufferOffset;
final int startLineNo = lineNo;
final int startLinePos = linePos;
final int termLen = terminator.length();
final int last = terminator.charAt(terminator.length() - 1);
int len = 0;
int total = 0;
StringBuilder consolidated = null;
while (lastChar >= 0) {
++len;
++total;
if (total >= termLen && lastChar == last) {
boolean cont = false;
for (int i = termLen - 1; i >= 0; --i) {
final int bp = bufferOffset + 1 - termLen + i;
if (bp < 0) {
if (consolidated == null) throw new IllegalStateException("Bad line consolidation");
if (consolidated.charAt(consolidated.length() + bp) != terminator.charAt(i)) {
cont = true;
break;
}
} else if (buffer[bp] != terminator.charAt(i)) {
cont = true;
break;
}
}
if (!cont) {
len -= terminator.length();
break;
}
}
if (!preLoaded && bufferOffset == (bufferLimit - 1)) {
if (consolidated == null) {
consolidated = new StringBuilder();
}
consolidated.append(buffer, startOffset, len);
startOffset = 0;
len = 0;
}
readNextChar();
}
if (lastChar <= 0 && !allowEof) {
throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
}
lastChar = 0;
if (consolidated != null) {
if (len > 0) {
consolidated.append(buffer, startOffset, len);
} else if (len < 0) {
consolidated.delete(consolidated.length() + len, consolidated.length());
}
return factory.genericToken(consolidated.toString().toCharArray(),
0,
consolidated.length(),
type, startLineNo, startLinePos);
} else if (len > 0) {
return factory.genericToken(buffer, startOffset, len, type, startLineNo, startLinePos);
}
return null;
}
// --- Object ---
@Override
public String toString() {
return getClass().getSimpleName() + "{preLoaded=" + preLoaded + "}";
}
private T readUntil(char terminator, TT type, boolean allowEof) throws IOException {
int startOffset = bufferOffset;
int startLineNo = lineNo;
int startLinePos = linePos;
int len = 0;
StringBuilder consolidated = null;
while (lastChar >= 0 && lastChar != terminator) {
++len;
if (!preLoaded && bufferOffset == (bufferLimit - 1)) {
if (consolidated == null) {
consolidated = new StringBuilder();
}
consolidated.append(buffer, startOffset, len);
startOffset = 0;
len = 0;
}
readNextChar();
}
if (lastChar < 0 && !allowEof) {
throw eofFailure("End of file while reading until '%s'", javaEscape(terminator));
}
lastChar = 0;
if (consolidated != null) {
if (len > 0) {
consolidated.append(buffer, startOffset, len);
}
return factory.genericToken(
consolidated.toString().toCharArray(),
0,
consolidated.length(),
type, startLineNo, startLinePos);
} else if (len > 0) {
return factory.genericToken(buffer, startOffset, len, type, startLineNo, startLinePos);
}
return null;
}
@Override
public T parseNextToken() throws IOException {
var next = parseNextTokenInternal();
while (next != null && maybeConsumeSilent(next)) {
skipped.add(next);
next = parseNextTokenInternal();
}
return next;
}
private T parseNextTokenInternal() throws IOException {
while (lastChar >= 0) {
if (lastChar == 0) {
if (!readNextChar()) {
break;
}
}
if (classifier.isWhitespace(lastChar)) {
// ignore
lastChar = 0;
} else if (classifier.startIdentifier(lastChar)) {
return nextIdentifier(lastChar);
} else if (classifier.startString(lastChar)) {
return nextString(lastChar);
} else if (classifier.startNumber(lastChar)) {
return nextNumber();
} else if (classifier.startLineComment(lastChar)) {
return nextLineComment();
} else if (classifier.startSymbol(lastChar)) {
return nextSymbol();
} else {
// non-allowed characters not starting a valid token.
T token = factory.symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
throw failure(token, "Unknown token initiator '%s'", javaEscape((char) lastChar));
}
}
return null;
}
// --- INTERNAL ---
/**
* @return Read the next identifier from buffer.
* @throws IOException If unable to read.
*/
protected T nextIdentifier(int first) throws IOException {
maybeConsolidateBuffer();
int startPos = linePos;
int startOffset = bufferOffset;
int startLine = lineNo;
int len = 1;
int lastLast;
if (first != lastChar) {
lastLast = first;
--len;
} else {
lastLast = lastChar;
if (!readNextChar()) {
return factory.identifierToken(buffer, startOffset, len, startLine, startPos);
}
}
if (classifier.allowBeforeString(lastLast) &&
classifier.startString(lastChar)) {
return nextString(lastLast);
}
while (classifier.allowIdentifier(lastChar) || classifier.identifierSeparator(lastChar)) {
++len;
if (classifier.identifierSeparator(lastLast) &&
!classifier.startSecondaryIdentifier(lastChar, lastLast)) {
if (classifier.endIdentifier(lastLast, lastChar)) {
T token = factory.identifierToken(buffer, startOffset, len - 2, startLine, startPos);
// rewind a single read.
this.lastChar = lastLast;
--this.bufferOffset;
--this.linePos;
return token;
} else if (classifier.identifierSeparator(lastChar)) {
T token = factory.identifierToken(buffer, startOffset, len, startLine, startPos);
throw failure(token, "Identifier with double separator '%c%c'", lastLast, lastChar);
} else {
T token = factory.identifierToken(buffer, startOffset, len, startLine, startPos);
throw failure(token, "Identifier part with invalid start '%c'", lastChar);
}
}
lastLast = lastChar;
if (!readNextChar()) {
break;
}
}
if (classifier.identifierSeparator(lastLast)) {
T token = factory.identifierToken(buffer, startOffset, len, startLine, startPos);
throw failure(token, "Identifier with trailing '%c'", (char) lastLast);
}
return factory.identifierToken(buffer, startOffset, len, startLine, startPos);
}
protected T nextLineComment() throws IOException {
return readUntil('\n', factory.lineCommentTokenType(), true);
}
protected T nextNumber() throws IOException {
maybeConsolidateBuffer();
// NOTE: This code is pretty messy because it is a full state-engine
// to ensure that the parsed number follows the JSON number syntax.
// Alternatives are:
//
// dec = -?0
// dec = -?.0
// dec = -?0.0
// sci = (dec)[eE][+-]?[0-9]+
// hex = 0x[0-9a-fA-F]+
// oct = 0[0-7]+
//
// It is programmed as a state-engine to be very efficient, but
// correctly detect valid JSON (and what is invalid if not).
int startLine = lineNo;
int startPos = linePos;
int startOffset = bufferOffset;
// number (any type).
int len = 0;
if (classifier.numberSign(lastChar)) {
// only base 10 decimals can be negative.
++len;
if (!readNextChar()) {
T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
throw failure(token, "Negative indicator without number");
}
if (!(lastChar == '.' || (lastChar >= '0' && lastChar <= '9'))) {
T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
throw failure(token, "No decimal after negative indicator");
}
} else if (classifier.numberEncodingIndicator(lastChar)) {
if (readNextChar()) {
++len;
if (lastChar == 'x') {
++len;
if (!readNextChar()) {
T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
throw failure(token, "No decimal after hex indicator");
}
// hexadecimal.
do {
if (!((lastChar >= '0' && lastChar <= '9') || (lastChar >= 'a' && lastChar <= 'f') ||
(lastChar >= 'A' && lastChar <= 'F'))) {
// we read a char that's *not* part of the hex number.
break;
}
++len;
} while (readNextChar());
return validateAfterNumber(startOffset, startPos, len);
} else if ('0' <= lastChar && lastChar <= '7') {
++len;
// Octals have 0 in front, and then more digits.
while (readNextChar()) {
if (lastChar < '0' || lastChar > '7') {
break;
}
++len;
}
return validateAfterNumber(startOffset, startPos, len);
}
} else {
// just '0'
return validateAfterNumber(startOffset, startPos, 1);
}
}
// decimal part.
while (lastChar >= '0' && lastChar <= '9') {
++len;
// numbers are terminated by first non-numeric character.
if (!readNextChar()) {
break;
}
}
// fraction part.
if (classifier.numberDecimalSep(lastChar)) {
++len;
// numbers are terminated by first non-numeric character.
if (readNextChar()) {
while (lastChar >= '0' && lastChar <= '9') {
++len;
// numbers are terminated by first non-numeric character.
if (!readNextChar()) {
break;
}
}
}
} else if (classifier.allowAfterInteger(lastChar) ||
classifier.allowAfterFloatingPoint(lastChar)) {
++len;
// ignore the actual char, but move the 'last' so we can check the char after.
readNextChar();
return validateAfterNumber(startOffset, startPos, len);
}
// exponent part.
if (classifier.numberExponentSep(lastChar)) {
++len;
// numbers are terminated by first non-numeric character.
if (!readNextChar()) {
T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
throw failure(token,
"Badly terminated number exponent: '%s'", token);
}
// The exponent can be explicitly prefixed with both '+' and '-'.
if (classifier.numberExponentSign(lastChar)) {
++len;
// numbers are terminated by first non-numeric character.
if (!readNextChar()) {
T token = factory.numberToken(buffer, startOffset, len, startLine, startPos);
throw failure(token, "Badly terminated number exponent: '%s'", token);
}
}
if (lastChar >= '0' && lastChar <= '9') {
while (lastChar >= '0' && lastChar <= '9') {
++len;
// numbers are terminated by first non-numeric character.
if (!readNextChar()) {
break;
}
}
} else {
T token = factory.numberToken(buffer, startOffset, len + 1, startLine, startPos);
throw failure(token, "Badly terminated number exponent: '%s'", token);
}
}
if (classifier.allowAfterFloatingPoint(lastChar)) {
++len;
// ignore the actual char, but move the 'last' so we can check the char after.
readNextChar();
}
return validateAfterNumber(startOffset, startPos, len);
}
private T validateAfterNumber(int startOffset, int startLinePos, int len)
throws IOException {
// A number must be terminated correctly. must not immediately start identifier or string.
if (classifier.startIdentifier(lastChar) ||
classifier.startString(lastChar) ||
classifier.startNumber(lastChar)) {
T token = factory.numberToken(buffer, startOffset, len + 1, lineNo, startLinePos);
throw failure(token, "Invalid termination of number: '%s'", token);
} else {
return factory.numberToken(buffer, startOffset, len, lineNo, startLinePos);
}
}
private T nextString(int quote) throws IOException {
maybeConsolidateBuffer();
// strings may be longer than 128 bytes. We may need to build it.
StringBuilder consolidatedString = null;
int startLine = lineNo;
int startPos = linePos;
int startOffset = bufferOffset;
int endOffset = 0;
if (!classifier.startString(quote)) {
--startPos;
--startOffset;
quote = lastChar;
}
boolean esc = false;
for (; ; ) {
if (!preLoaded && !bufferLineEnd && bufferOffset >= (bufferLimit - 1)) {
if (consolidatedString == null) {
consolidatedString = new StringBuilder();
}
consolidatedString.append(buffer, startOffset, bufferOffset - startOffset + 1);
startOffset = 0;
}
if (!readNextChar()) {
throw eofFailure("Unexpected end of stream in string");
}
if (esc) {
esc = false;
} else if (lastChar == '\\') {
esc = true;
} else if (lastChar == quote) {
if (readNextChar()) {
if (classifier.allowAfterString(lastChar)) {
if (readNextChar()) {
if (classifier.startIdentifier(lastChar) || classifier.startString(lastChar)) {
T token = factory.identifierToken(buffer, bufferOffset, 1, startLine, linePos);
throw failure(token, "Unexpected token after string marker: '%s'", token);
}
}
}
}
break;
} else if (lastChar == '\n' || lastChar == '\r') {
T token = factory.symbolToken(buffer, bufferOffset, 1, startLine, linePos);
throw failure(token, "Unexpected newline in string");
} else if (lastChar < 0x20 || lastChar == 0x7f ||
(lastChar > 0x7f && !ConsoleUtil.isConsolePrintable(lastChar))) {
T token = factory.symbolToken(buffer, bufferOffset, 1, startLine, linePos);
throw failure(token, "Unescaped non-printable char in string: '%s'",
javaEscape((char) lastChar));
}
}
if (consolidatedString != null) {
consolidatedString.append(buffer, 0, bufferOffset + endOffset);
String result = consolidatedString.toString();
return factory.stringToken(result.toCharArray(), 0, result.length(), startLine, startPos);
} else {
return factory.stringToken(buffer,
startOffset,
(bufferOffset - startOffset) + endOffset,
startLine,
startPos);
}
}
/**
* This method is protected so that it can be overridden to
* produce multi-symbol tokens if necessary.
*
* @return Symbol token.
* @throws IOException If unable to read token.
*/
protected T nextSymbol() throws IOException {
if (classifier.allowBeforeIdentifier(lastChar)) {
var first = lastChar;
if (!readNextChar()) {
factory.symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
}
if (classifier.startIdentifier(lastChar)) {
--linePos;
--bufferOffset;
return nextIdentifier(first);
}
return factory.symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
}
lastChar = 0;
return factory.symbolToken(buffer, bufferOffset, 1, lineNo, linePos);
}
}