Lexer.java

/*
 * Copyright (c) 2015-2020, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.lexer;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Iterator;
import java.util.function.Predicate;

import static net.morimekta.strings.EscapeUtil.javaEscape;

/**
 * Base lexer class with helper methods that do not need to be
 * implemented. The base lexer should be able to continuously return
 * tokens until the end of the stream, or until the lexing process fails.
 *
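 * <p>A minimal usage sketch, assuming hypothetical {@code MyTokenType},
 * {@code MyToken} and {@code MyTokenizer} types for the token type, the
 * token and the tokenizer, and some input {@code reader} for the tokenizer:
 * <pre>{@code
 * class MyLexer extends Lexer<MyTokenType, MyToken> {
 *     MyLexer(MyTokenizer tokenizer) {
 *         super(tokenizer);
 *     }
 * }
 *
 * MyLexer lexer = new MyLexer(new MyTokenizer(reader));
 * while (lexer.hasNext()) {
 *     MyToken token = lexer.next();
 *     // ... handle the token; may throw LexerException or IOException ...
 * }
 * }</pre>
 *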
 * @param <TT> The token-type enum class.
 * @param <T> The token implementation class.
 */
public class Lexer<TT extends TokenType, T extends Token<TT>> implements Iterable<T> {
    /**
     * Create a lexer instance using a specific tokenizer.
     *
     * @param tokenizer The tokenizer to be used to get tokens.
     */
    protected Lexer(Tokenizer<TT, T> tokenizer) {
        this.tokenizer = tokenizer;
    }

    /**
     * Make a lexing / parsing failure exception for reaching an
     * unexpected end of file.
     *
     * @param line The content of the line for the failure.
     * @param lineNo The line number for the failure.
     * @param linePos The position in the line for the failure.
     * @param message The message for the failure.
     * @param args Arguments to format message.
     * @return The failure exception.
     */
    protected LexerException eofFailure(CharSequence line, int lineNo, int linePos, String message, Object... args) {
        if (args.length > 0) {
            message = String.format(message, args);
        }
        return new LexerException(line, lineNo, linePos, 1, message);
    }

    /**
     * Make a lexing / parsing failure exception.
     *
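     * <p>For example, given a previously read {@code token} (using the
     * statically imported {@code javaEscape} as above):
     * <pre>{@code
     * throw lexer.failure(token, "unexpected token '%s'", javaEscape(token));
     * }</pre>
     *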
     * @param token The token causing the failure.
     * @param message The message for the failure.
     * @param args Arguments for formatting message.
     * @return The failure exception.
     */
    public LexerException failure(T token, String message, Object... args) {
        if (args.length > 0) {
            message = String.format(message, args);
        }
        return new LexerException(token, message);
    }

    /**
     * Consume and return the next token. This should not
     * trigger parsing anything after this token.
     *
     * @return The next token, or null at the end of the stream.
     * @throws LexerException If parsing token failed.
     * @throws IOException If reading failed.
     */
    public T next() throws LexerException, IOException {
        T lastToken;
        if (nextToken != null) {
            lastToken = nextToken;
            nextToken = null;
        } else {
            lastToken = tokenizer.parseNextToken();
        }
        return lastToken;
    }

    /**
     * Return true if there is a 'next' token. If this method returns
     * true, then 'peek' must return non-null until the lexer is advanced,
     * and the next call to 'next' must return non-null.
     *
     * @return If there is a next token.
     * @throws LexerException If parsing token failed.
     * @throws IOException If reading failed.
     */
    public boolean hasNext() throws LexerException, IOException {
        return peek() != null;
    }

    /**
     * Return the token that will be returned by 'next', but do not
     * 'consume' it. If this method returns a non-null value, 'next'
     * must return the same value exactly once.
     *
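     * <p>For example, a sketch that consumes an optional list separator,
     * assuming hypothetical {@code MyToken} tokens:
     * <pre>{@code
     * MyToken ahead = lexer.peek();
     * if (ahead != null && ahead.isSymbol(',')) {
     *     lexer.next();  // consume the optional ',' separator
     * }
     * }</pre>
     *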
     * @return The next token.
     * @throws LexerException If parsing token failed.
     * @throws IOException If reading failed.
     */
    public T peek() throws LexerException, IOException {
        if (nextToken == null) {
            nextToken = tokenizer.parseNextToken();
        }
        return nextToken;
    }

    /**
     * Peek at the next token, and fail if there is no next token.
     *
     * @param what The exception message on failure.
     * @return The token to be the next.
     * @throws LexerException On parse errors.
     * @throws IOException If reading failed.
     */
    public T peek(String what) throws LexerException, IOException {
        T token = peek();
        if (token == null) {
            throw eofFailure(tokenizer.currentLine(),
                             tokenizer.currentLineNo(),
                             tokenizer.currentLinePos(),
                             "Expected %s, but got end of file",
                             what);
        }
        return token;
    }

    /**
     * Expect a new token, and fail if there is no next token.
     *
     * @param what What is expected.
     * @return The next token.
     * @throws LexerException On parse errors or missing next token.
     * @throws IOException If reading failed.
     */
    public T expect(String what) throws LexerException, IOException {
        T next = next();
        if (next == null) {
            throw eofFailure(tokenizer.currentLine(),
                             tokenizer.currentLineNo(),
                             tokenizer.currentLinePos(),
                             "Expected %s, but got end of file",
                             what);
        }
        return next;
    }

    /**
     * Expect a new token, and fail if the token is not of the given token type.
     *
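     * <p>For example (a sketch, assuming a hypothetical {@code MyTokenType}
     * enum with an {@code IDENTIFIER} constant):
     * <pre>{@code
     * MyToken name = lexer.expect("field name", MyTokenType.IDENTIFIER);
     * }</pre>
     *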
     * @param what The exception message on failure.
     * @param type The token type being expected.
     * @return The token to be the next.
     * @throws LexerException On parse errors or validation failures.
     * @throws IOException If reading failed.
     */
    public T expect(String what, TT type) throws LexerException, IOException {
        T token = expect(what);
        if (!token.type().equals(type)) {
            nextToken = token;
            throw failure(token, "Expected %s, but got '%s'", what, javaEscape(token));
        }
        return token;
    }

    /**
     * Expect a new token, and fail if the token does not validate.
     *
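     * <p>For example (a sketch, assuming a hypothetical {@code MyTokenType}
     * enum with {@code NUMBER} and {@code STRING} constants):
     * <pre>{@code
     * MyToken value = lexer.expect(
     *         "field value",
     *         t -> t.type() == MyTokenType.NUMBER || t.type() == MyTokenType.STRING);
     * }</pre>
     *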
     * @param what The exception message on failure.
     * @param validator Validator to check on the token.
     * @return The token to be the next.
     * @throws LexerException On parse errors or validation failure.
     * @throws IOException If reading failed.
     */
    public T expect(String what, Predicate<T> validator) throws LexerException, IOException {
        T token = expect(what);
        if (!validator.test(token)) {
            nextToken = token;
            throw failure(token, "Expected %s, but got '%s'", what, javaEscape(token));
        }
        return token;
    }

    /**
     * Expect the next token to be a single-character symbol token matching
     * one of the given symbols, and fail otherwise.
     *
     * @param what The exception message on failure.
     * @param symbols Symbols to be expected.
     * @return The token of the symbol.
     * @throws LexerException On parse errors or validation failure.
     * @throws IOException If reading failed.
     */
    public T expectSymbol(String what, char... symbols) throws LexerException, IOException {
        return expect(what, t -> {
            if (t.length() != 1) {
                return false;
            }
            for (char s : symbols) {
                if (t.isSymbol(s)) {
                    return true;
                }
            }
            return false;
        });
    }

    /**
     * Read input until the given termination string, and return what was
     * read as a single token of the given type.
     *
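     * <p>For example, a sketch that reads the rest of the current line as
     * a single token, assuming a hypothetical {@code LINE_COMMENT} constant
     * in {@code MyTokenType}:
     * <pre>{@code
     * MyToken comment = lexer.readUntil("\n", MyTokenType.LINE_COMMENT, true);
     * }</pre>
     *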
     * @param term The termination string.
     * @param type The type of token to be generated.
     * @param allowEof If we allow end of file to terminate the token.
     * @return The token that was read, if it has any content.
     * @throws LexerException On parse errors or validation failure.
     * @throws IOException If reading failed.
     */
    public T readUntil(String term, TT type, boolean allowEof) throws LexerException, IOException {
        nextToken = null;
        return tokenizer.readUntil(term, type, allowEof);
    }

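    /**
     * Iterate over the remaining tokens. The returned iterator wraps
     * {@link LexerException} in {@link UncheckedLexerException} and
     * {@link IOException} in {@link UncheckedIOException}, so the lexer
     * can be used directly in a for-each loop, e.g. with the hypothetical
     * {@code MyToken} type from the class example:
     * <pre>{@code
     * for (MyToken token : lexer) {
     *     // ... handle the token ...
     * }
     * }</pre>
     *
     * @return An iterator over the remaining tokens.
     */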
    @Override
    public Iterator<T> iterator() {
        return new LexerIterator();
    }

    // --- PRIVATE ---

    private final Tokenizer<TT, T> tokenizer;
    private T nextToken;

    private class LexerIterator implements Iterator<T> {
        @Override
        public boolean hasNext() {
            try {
                return Lexer.this.hasNext();
            } catch (LexerException e) {
                throw new UncheckedLexerException(e);
            } catch (IOException e) {
                throw new UncheckedIOException(e.getMessage(), e);
            }
        }

        @Override
        public T next() {
            try {
                return Lexer.this.expect("anything");
            } catch (LexerException e) {
                throw new UncheckedLexerException(e);
            } catch (IOException e) {
                throw new UncheckedIOException(e.getMessage(), e);
            }
        }
    }

    @Override
    public String toString() {
        return getClass().getSimpleName() + "{tokenizer=" + tokenizer + ", next=" + nextToken + "}";
    }
}