// Tokenizer.java

/*
 * Copyright (c) 2015-2020, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.lexer;

import java.io.IOException;
import java.util.List;

/**
 * Contract for a tokenizer: a component that produces tokens one at a time,
 * keeps track of tokens that were skipped along the way, and exposes the
 * position at which it will continue parsing.
 *
 * @param <TT> Type used to classify tokens (the "token type").
 * @param <T>  Concrete token class produced by the tokenizer.
 */
public interface Tokenizer<TT, T extends Token<TT>> {
    /**
     * Advance through the content and produce the token that follows.
     *
     * @return The following token, or {@code null} when no more tokens exist.
     * @throws LexerException When the token could not be parsed.
     * @throws IOException    When reading the content failed.
     */
    T parseNextToken() throws LexerException, IOException;

    /**
     * Check whether any skipped tokens are currently held.
     *
     * @return {@code true} when at least one skipped token has been
     * accumulated after the most recent {@link #clearSkippedTokens()} call.
     */
    boolean hasSkippedTokens();

    /**
     * Look at the skipped tokens collected so far.
     *
     * @return An immutable copy of the skipped tokens accumulated at this point.
     */
    List<T> getSkippedTokens();

    /**
     * Hand out the skipped tokens collected so far, and reset the accumulated
     * skipped tokens.
     *
     * @return An immutable copy of the skipped tokens as they were before the
     * reset.
     */
    List<T> clearSkippedTokens();

    /**
     * Consume everything up to the given terminator string. The terminator
     * itself must <b>not</b> be included in the token that is returned. The
     * consumed content is allowed to stretch across several lines.
     *
     * @param terminator String marking where the reading stops.
     * @param type       Token type to assign to the resulting token.
     * @param allowEof   When {@code true}, reaching EOF is accepted in place
     *                   of the terminator.
     * @return A token with the content from the current position up to the
     * terminator or the end of the file, or {@code null} when there were no
     * chars ahead of the terminator.
     * @throws LexerException On parse errors or validation failure.
     * @throws IOException    When reading the content failed.
     */
    T readUntil(CharSequence terminator, TT type, boolean allowEof)
            throws LexerException, IOException;

    /**
     * Line number of the current position. Lines are meant to be 1-indexed,
     * so everything ahead of the first newline belongs to line 1. The value
     * refers to the spot where parsing will continue; tokens handed out
     * earlier are expected to carry their own position.
     *
     * @return The line number of the current position.
     */
    int currentLineNo();

    /**
     * Char position inside the current line. The position is 0-indexed. The
     * value refers to the spot where parsing will continue; tokens handed out
     * earlier are expected to carry their own position.
     *
     * @return The position within the current line.
     */
    int currentLinePos();

    /**
     * Text of the line the tokenizer is currently positioned on.
     *
     * @return The current line content. The returned char-sequence has to be
     * effectively immutable.
     */
    CharSequence currentLine();

    /**
     * Decide whether the given token should be skipped, and collected as a
     * 'skipped' token, rather than being handled as a regular token. The
     * default implementation never skips.
     *
     * @param token Token to be checked.
     * @return {@code true} when the token should be skipped.
     */
    default boolean skipTokenOnParseNext(T token) {
        return false;
    }
}