// TokenClassifier.java

package net.morimekta.lexer;

public abstract class TokenClassifier {
    /**
     * Whitespace characters are generally ignored, and assumed to carry no
     * meaning. By default these are space, tab, carriage return and newline.
     *
     * @param ch The character to classify.
     * @return True if the character is a whitespace.
     */
    public boolean isWhitespace(int ch) {
        switch (ch) {
            case ' ':
            case '\t':
            case '\r':
            case '\n':
                return true;
            default:
                return false;
        }
    }

    // --------------------------------------------
    // ---------------- Identifier ----------------
    // --------------------------------------------

    /**
     * By default an identifier may start with an underscore or an ASCII letter.
     *
     * @param ch Character to check.
     * @return True if the character is a valid start of an identifier.
     */
    public boolean startIdentifier(int ch) {
        return ch == '_' || isAsciiLetter(ch);
    }

    /**
     * By default no such prefix character is accepted.
     *
     * @param ch Character to check.
     * @return True if the char is a special character allowed only to initiate
     *         an identifier, *before* the normal identifier name rules apply.
     */
    public boolean allowBeforeIdentifier(int ch) {
        return false;
    }

    /**
     * By default this delegates to {@link #startIdentifier(int)} and does not
     * look at the separator.
     *
     * @param ch  Character to check.
     * @param sep The separator character.
     * @return True if the character is allowed to start a secondary identifier.
     */
    public boolean startSecondaryIdentifier(int ch, int sep) {
        return this.startIdentifier(ch);
    }

    /**
     * By default an underscore, an ASCII letter or an ASCII digit is accepted.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as part of an identifier.
     */
    public boolean allowIdentifier(int ch) {
        return ch == '_' || isAsciiLetter(ch) || isAsciiDigit(ch);
    }

    /**
     * The rule for identifiers is that they are a separator-joined string of
     * allowed identifier chars. The separator cannot be directly repeated,
     * and the identifier cannot end with a separator. If multiple
     * separators are available, any two separators cannot come directly
     * after one another. By default the only separator is '.'.
     *
     * @param ch Character to check.
     * @return True if the character is separating parts of the identifier.
     */
    public boolean identifierSeparator(int ch) {
        return '.' == ch;
    }

    /**
     * In case of a double ID separator (e.g. '..' or '::'), or a separator
     * followed by a non-ID character, should the identifier stop at the last
     * OK char and continue tokenizing after the ID as a new thing (likely a
     * symbol)? By default the answer is no.
     *
     * @param sep Separator triggering the check.
     * @param ch  Character after the separator that is *not* a valid secondary
     *            ID character start.
     * @return True if the ID should be completed excluding the separator.
     */
    public boolean endIdentifier(int sep, int ch) {
        // E.g. in case of double ID separator or similar (".." after the ID).
        return false;
    }

    // ----------------------------------------
    // ---------------- Number ----------------
    // ----------------------------------------

    /**
     * By default a number may start with an ASCII digit, '-' or '.'.
     *
     * @param ch Character to check.
     * @return True if the character is a valid start of a number.
     */
    public boolean startNumber(int ch) {
        return isAsciiDigit(ch) || ch == '-' || ch == '.';
    }

    /**
     * By default no type-marker is accepted.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as a type-marker for an integer.
     */
    public boolean allowAfterInteger(int ch) {
        return false;
    }

    /**
     * By default no type-marker is accepted.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as a type-marker for a floating
     *         point number. This is also checked after integers.
     */
    public boolean allowAfterFloatingPoint(int ch) {
        return false;
    }

    /**
     * @param ch Character to check.
     * @return True if the character represents a sign before a number.
     *         By default only '-'.
     */
    public boolean numberSign(int ch) {
        return '-' == ch;
    }

    /**
     * @param ch Character to check.
     * @return True if the character indicates special radix encoding.
     *         By default only '0'.
     */
    public boolean numberEncodingIndicator(int ch) {
        return '0' == ch;
    }

    /**
     * @param ch Character to check.
     * @return True if the character is the decimal point separator.
     *         By default only '.'.
     */
    public boolean numberDecimalSep(int ch) {
        return '.' == ch;
    }

    /**
     * @param ch Character to check.
     * @return True if the character is the exponent separator.
     *         By default 'e' or 'E'.
     */
    public boolean numberExponentSep(int ch) {
        switch (ch) {
            case 'E':
            case 'e':
                return true;
            default:
                return false;
        }
    }

    /**
     * @param ch Character to check.
     * @return True if the character is a sign indicator for the exponent
     *         value. By default '+' or '-'.
     */
    public boolean numberExponentSign(int ch) {
        switch (ch) {
            case '+':
            case '-':
                return true;
            default:
                return false;
        }
    }

    // ----------------------------------------
    // ---------------- String ----------------
    // ----------------------------------------

    /**
     * @param ch Character to check.
     * @return True if the character is a valid start of a string.
     *         By default only the double quote.
     */
    public boolean startString(int ch) {
        return '"' == ch;
    }

    /**
     * E.g. can check for a single character at the end of the string literal
     * and include that as part of the token. E.g. <code>'10101001'b</code>.
     * By default no such character is accepted.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as a post-string type indicator.
     */
    public boolean allowAfterString(int ch) {
        return false;
    }

    /**
     * E.g. can check for a single character in front of the string literal
     * and include that as part of the token. E.g. <code>b'10101001'</code>.
     * Note that the char also has to be a valid identifier.
     * By default no such character is accepted.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as a pre-string type indicator.
     */
    public boolean allowBeforeString(int ch) {
        return false;
    }

    // ----------------------------------------
    // ---------------- Symbol ----------------
    // ----------------------------------------

    /**
     * By default any ASCII character strictly between space (0x20) and
     * DEL (0x7F) is accepted.
     *
     * @param ch Character to check.
     * @return True if the character is a valid start of a symbol.
     */
    public boolean startSymbol(int ch) {
        // '!' is 0x21 and '~' is 0x7E, i.e. the range just inside 0x20..0x7F.
        return '!' <= ch && ch <= '~';
    }

    // ----------------------------------------------
    // ---------------- Line Comment ----------------
    // ----------------------------------------------

    /**
     * @param ch Character to check.
     * @return True if the character is a valid start of a line comment.
     *         By default only '#'.
     */
    public boolean startLineComment(int ch) {
        return '#' == ch;
    }

    // -----------------------------------------
    // ---------------- Helpers ----------------
    // -----------------------------------------

    /** Checks for an ASCII letter, 'a'..'z' or 'A'..'Z'. */
    private static boolean isAsciiLetter(int ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    }

    /** Checks for an ASCII digit, '0'..'9'. */
    private static boolean isAsciiDigit(int ch) {
        return ch >= '0' && ch <= '9';
    }
}