// TokenClassifier.java
package net.morimekta.lexer;
public abstract class TokenClassifier {
    /**
     * Tell whether a character counts as whitespace. Whitespace characters
     * are generally ignored, and assumed to have no meaning.
     *
     * @param ch The character to classify.
     * @return True for line feed, carriage return, tab and space; false for
     *         every other value.
     */
    public boolean isWhitespace(int ch) {
        switch (ch) {
            case '\n':
            case '\r':
            case '\t':
            case ' ':
                return true;
            default:
                return false;
        }
    }

    // --------------------------------------------
    // ---------------- Identifier ----------------
    // --------------------------------------------

    /**
     * Tell whether a character may begin an identifier. By default that is
     * an underscore or an ASCII letter.
     *
     * @param ch Character to check.
     * @return True if the character is a valid start of an identifier.
     */
    public boolean startIdentifier(int ch) {
        if (ch == '_') {
            return true;
        }
        return between(ch, 'a', 'z') || between(ch, 'A', 'Z');
    }

    /**
     * Tell whether a character is a special marker that may only appear in
     * front of an identifier, <i>before</i> the normal identifier name rules
     * apply. The default implementation accepts no character.
     *
     * @param ch Character to check.
     * @return True if the character may precede an identifier.
     */
    public boolean allowBeforeIdentifier(int ch) {
        return false;
    }

    /**
     * Tell whether a character may begin a secondary identifier part, i.e.
     * the part following a separator. The default implementation ignores the
     * separator and defers to {@link #startIdentifier(int)}.
     *
     * @param ch  Character to check.
     * @param sep The separator character.
     * @return True if the character is allowed to start a secondary identifier.
     */
    public boolean startSecondaryIdentifier(int ch, int sep) {
        return this.startIdentifier(ch);
    }

    /**
     * Tell whether a character may appear inside an identifier. By default
     * that is an underscore, an ASCII letter or an ASCII digit.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as part of an identifier.
     */
    public boolean allowIdentifier(int ch) {
        if (ch == '_') {
            return true;
        }
        if (between(ch, 'a', 'z') || between(ch, 'A', 'Z')) {
            return true;
        }
        return between(ch, '0', '9');
    }

    /**
     * Tell whether a character separates the parts of an identifier.
     * <p>
     * An identifier is a separator-joined string of allowed identifier
     * chars. A separator cannot be directly repeated, and the identifier
     * cannot end with a separator. If several separators are available, no
     * two of them may directly follow one another.
     * <p>
     * The default separator is the dot.
     *
     * @param ch Character to check.
     * @return True if the character separates parts of the identifier.
     */
    public boolean identifierSeparator(int ch) {
        return '.' == ch;
    }

    /**
     * Decide what happens on a double ID separator (e.g. '..' or '::'), or
     * on a separator followed by a non-ID character: should the identifier
     * stop at the last OK char, and tokenizing continue after the ID as a
     * new thing (likely a symbol)? The default implementation always answers
     * no.
     *
     * @param sep Separator triggering the check.
     * @param ch  Character after the separator that is <i>not</i> a valid
     *            secondary ID character start.
     * @return True if the ID should be completed excluding the separator.
     */
    public boolean endIdentifier(int sep, int ch) {
        // Covers e.g. ".." directly after the ID; not accepted by default.
        return false;
    }

    // ----------------------------------------
    // ---------------- Number ----------------
    // ----------------------------------------

    /**
     * Tell whether a character may begin a number. By default that is a
     * dot, a minus sign or an ASCII digit.
     *
     * @param ch Character to check.
     * @return True if the character is a valid start of a number.
     */
    public boolean startNumber(int ch) {
        switch (ch) {
            case '.':
            case '-':
                return true;
            default:
                return between(ch, '0', '9');
        }
    }

    /**
     * Tell whether a character may follow an integer as a type-marker. The
     * default implementation accepts no character.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as a type-marker for an integer.
     */
    public boolean allowAfterInteger(int ch) {
        return false;
    }

    /**
     * Tell whether a character may follow a floating point number as a
     * type-marker. This is also checked after integers. The default
     * implementation accepts no character.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as a type-marker for a
     *         floating point number.
     */
    public boolean allowAfterFloatingPoint(int ch) {
        return false;
    }

    /**
     * Tell whether a character is a sign placed before a number. By default
     * only the minus sign is.
     *
     * @param ch Character to check.
     * @return True if the character represents a sign before a number.
     */
    public boolean numberSign(int ch) {
        return '-' == ch;
    }

    /**
     * Tell whether a character announces a special radix encoding. By
     * default only the digit zero does.
     *
     * @param ch Character to check.
     * @return True if the character indicates special radix encoding.
     */
    public boolean numberEncodingIndicator(int ch) {
        return '0' == ch;
    }

    /**
     * Tell whether a character is the decimal point of a number. By default
     * only the dot is.
     *
     * @param ch Character to check.
     * @return True if the character is the decimal point separator.
     */
    public boolean numberDecimalSep(int ch) {
        return '.' == ch;
    }

    /**
     * Tell whether a character introduces the exponent of a number. By
     * default that is a lower or upper case 'e'.
     *
     * @param ch Character to check.
     * @return True if the character is the exponent separator.
     */
    public boolean numberExponentSep(int ch) {
        switch (ch) {
            case 'e':
            case 'E':
                return true;
            default:
                return false;
        }
    }

    /**
     * Tell whether a character is a sign for the exponent value. By default
     * that is minus or plus.
     *
     * @param ch Character to check.
     * @return True if the character is a sign indicator for the exponent.
     */
    public boolean numberExponentSign(int ch) {
        switch (ch) {
            case '-':
            case '+':
                return true;
            default:
                return false;
        }
    }

    // ----------------------------------------
    // ---------------- String ----------------
    // ----------------------------------------

    /**
     * Tell whether a character opens a string literal. By default only the
     * double quote does.
     *
     * @param ch Character to check.
     * @return True if the character is a valid start of a string.
     */
    public boolean startString(int ch) {
        return '"' == ch;
    }

    /**
     * Tell whether a single character directly after the string literal
     * should be included as part of the token, e.g. {@code '10101001'b}.
     * The default implementation accepts no character.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as a post-string type indicator.
     */
    public boolean allowAfterString(int ch) {
        return false;
    }

    /**
     * Tell whether a single character directly before the string literal
     * should be included as part of the token, e.g. {@code b'10101001'}.
     * Note that the char also has to be a valid identifier. The default
     * implementation accepts no character.
     *
     * @param ch Character to check.
     * @return True if the character is allowed as a pre-string type indicator.
     */
    public boolean allowBeforeString(int ch) {
        return false;
    }

    // ----------------------------------------
    // ---------------- Symbol ----------------
    // ----------------------------------------

    /**
     * Tell whether a character may begin a symbol. By default that is any
     * value strictly between 0x20 (space) and 0x7F (DEL).
     *
     * @param ch Character to check.
     * @return True if the character is a valid start of a symbol.
     */
    public boolean startSymbol(int ch) {
        return !(ch <= 0x20 || ch >= 0x7F);
    }

    // ----------------------------------------------
    // ---------------- Line Comment ----------------
    // ----------------------------------------------

    /**
     * Tell whether a character begins a line comment. By default only the
     * hash sign does.
     *
     * @param ch Character to check.
     * @return True if the character is a valid start of a line comment.
     */
    public boolean startLineComment(int ch) {
        return '#' == ch;
    }

    // -----------------------------------------
    // ---------------- Helpers ----------------
    // -----------------------------------------

    /**
     * Inclusive range check shared by the default classification rules.
     *
     * @param ch    Character to check.
     * @param first Lowest accepted character.
     * @param last  Highest accepted character.
     * @return True if {@code first <= ch <= last}.
     */
    private static boolean between(int ch, char first, char last) {
        return first <= ch && ch <= last;
    }
}