Unicode.java
/*
* Copyright (c) 2016, Stein Eldar Johnsen
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package net.morimekta.strings.chr;
import java.util.Objects;
import static java.lang.Character.isBmpCodePoint;
import static net.morimekta.strings.ConsoleUtil.isConsolePrintable;
/**
* Representation of a unicode character. It represents the full 31-bit unicode
* code point, and will expand to a 2-char surrogate pair string if necessary.
*/
public class Unicode
implements Char {
/**
* No-break space (U+00A0).
*/
public static final Unicode NBSP = unicode(0x00A0);
/**
* Left-to-right isolate (U+2066): sets the base direction to LTR and isolates
* the embedded content from the surrounding text.
*/
public static final Unicode LTR_ISOLATE = unicode(0x2066);
/**
* Right-to-left isolate (U+2067): sets the base direction to RTL and isolates
* the embedded content from the surrounding text.
*/
public static final Unicode RTL_ISOLATE = unicode(0x2067);
/**
* First-strong isolate (U+2068): isolates the content and sets the direction
* according to the first strongly typed directional character.
*/
public static final Unicode FS_ISOLATE = unicode(0x2068);
/**
* Left-to-right embedding (U+202A): sets the base direction to LTR but allows
* the embedded text to interact with surrounding content, so there is a risk
* of spillover effects.
*/
public static final Unicode LTR_EMBED = unicode(0x202A);
/**
* Right-to-left embedding (U+202B): sets the base direction to RTL but allows
* the embedded text to interact with surrounding content, so there is a risk
* of spillover effects.
*/
public static final Unicode RTL_EMBED = unicode(0x202B);
/**
* Left-to-right override (U+202D): overrides the bidirectional algorithm to
* display characters in memory order, progressing from left to right.
*/
public static final Unicode LTR_OVERRIDE = unicode(0x202D);
/**
* Right-to-left override (U+202E): overrides the bidirectional algorithm to
* display characters in memory order, progressing from right to left.
*/
public static final Unicode RTL_OVERRIDE = unicode(0x202E);
/**
* Create a unicode instance from a code point. This is a plain factory
* wrapper around the {@code int} constructor.
*
* @param cp Code point of unicode character, can be BMP or extended.
* @return The unicode char.
*/
public static Unicode unicode(int cp) {
return new Unicode(cp);
}
/**
* Create a unicode instance from a single java char. This is a plain factory
* wrapper around the {@code char} constructor.
*
* @param ch The unicode character (BMP).
* @return The unicode char.
*/
public static Unicode unicode(char ch) {
return new Unicode(ch);
}
/**
* The code point this instance represents. Set once in the constructor.
*/
private final int cp;
/**
* Create a unicode instance.
*
* @param cp Character code point. The value is stored as given, no range
*           check is done here.
*/
public Unicode(int cp) {
this.cp = cp;
}
/**
* Create a unicode instance.
*
* @param ch Unicode character. The char value is widened to an int and used
*           as the code point.
*/
public Unicode(char ch) {
this.cp = ch;
}
/**
* @return The code point this instance was created with.
*/
@Override
public int codepoint() {
return cp;
}
/**
 * Render this character as a readable, escaped string.
 * <ul>
 *   <li>Named control characters and the invisible spacing / bidi formatting
 *       characters become a bracketed mnemonic, e.g. {@code <ESC>}.</li>
 *   <li>Tab, line feed, form feed and carriage return become their usual
 *       backslash-letter escapes.</li>
 *   <li>Double quote, single quote and backslash are prefixed with a
 *       backslash.</li>
 *   <li>Any remaining code point below 0x20 becomes a backslash followed by
 *       three octal digits.</li>
 *   <li>Console printable code points are emitted as themselves (as a
 *       surrogate pair when outside the BMP).</li>
 *   <li>Everything else becomes one backslash-u 4-digit hex escape per
 *       UTF-16 unit.</li>
 * </ul>
 *
 * @return The escaped representation of the character.
 */
@Override
public String asString() {
    switch (cp) {
        // --- control characters shown by name.
        case NUL:
            return "<NUL>";
        case ABR:
            return "<ABR>";
        case EOF:
            return "<EOF>";
        case ENQ:
            return "<ENQ>";
        case ACK:
            return "<ACK>";
        case NAK:
            return "<NAK>";
        case BEL:
            // Shown by name, as the '\a' escape is not universal.
            return "<BEL>";
        case BS:
            // Shown by name instead of the '\b' escape.
            return "<BS>";
        case VT:
            return "<VT>";
        case SS:
            return "<SS>";
        case SI:
            return "<SI>";
        case ESC:
            return "<ESC>";
        case CAN:
            return "<CAN>";
        case XON:
            return "<XON>";
        case XOFF:
            return "<XOFF>";
        case FS:
            return "<FS>";
        case GS:
            return "<GS>";
        case RS:
            return "<RS>";
        case US:
            return "<US>";
        case DEL:
            return "<DEL>";

        // --- invisible spacing and bidi formatting characters.
        case 0x00A0:
            return "<nbsp>";
        case 0x2066:
            return "<ltr-isolate>";
        case 0x2067:
            return "<rtl-isolate>";
        case 0x2068:
            return "<fs-isolate>";
        case 0x202A:
            return "<ltr-embedded>";
        case 0x202B:
            return "<rtl-embedded>";
        case 0x202D:
            return "<ltr-override>";
        case 0x202E:
            return "<rtl-override>";

        // --- characters with a well known backslash escape.
        case TAB:
            return "\\t";
        case LF:
            return "\\n";
        case FF:
            return "\\f";
        case CR:
            return "\\r";
        case '\"':
        case '\'':
        case '\\':
            return "\\" + (char) cp;

        default:
            break;
    }

    if (cp < 0x20) {
        // Remaining low control characters: escaped octal value.
        return String.format("\\%03o", cp);
    }

    if (isConsolePrintable(cp)) {
        // Printable: the character itself, as one or two UTF-16 units.
        if (isBmpCodePoint(cp)) {
            return String.valueOf((char) cp);
        }
        char[] pair = {Character.highSurrogate(cp), Character.lowSurrogate(cp)};
        return new String(pair);
    }

    // Not printable: hex escape per UTF-16 unit.
    if (isBmpCodePoint(cp)) {
        return String.format("\\u%04x", cp);
    }
    String high = String.format("\\u%04x", (int) Character.highSurrogate(cp));
    String low = String.format("\\u%04x", (int) Character.lowSurrogate(cp));
    return high + low;
}
/**
* Width of this character when printed, counted in character cells
* (presumably console columns).
* <p>
* The code point is matched against hand-maintained range tables, which are
* grouped by coarse code point range so only one table is tested per call:
* <ul>
*   <li>{@code 0} for control characters, combining marks and invisible
*       formatting characters,</li>
*   <li>{@code 2} for double-width characters (CJK, Hangul, and a number of
*       symbols),</li>
*   <li>{@code 1} for everything that matches neither table.</li>
* </ul>
*
* @return 0, 1 or 2.
*/
@Override
public int printableWidth() {
if (cp < 0x0300) {
// Everything below 0x20, and 0x7F (DEL) up to and including 0x9F.
if (cp < 0x20 ||
(0x7F <= cp && cp < 0xA0)) {
return 0;
}
} else if (cp < 0x0600) {
if ((cp < 0x0370) || // 0x0300 - 0x036f: 0-width combining diacritical marks
(0x0483 <= cp && cp <= 0x0489) || // 0-width combining cyrillic marks
(0x0591 <= cp && cp <= 0x05c7 && // 0-width hebrew accents and points, except punctuation
cp != 0x05be && cp != 0x05c0 && cp != 0x05c3 && cp != 0x05c6)) {
return 0;
}
} else if (cp < 0x0800) {
if ((cp <= 0x0605) || // 0x0600 - 0x0605: arabic number / format signs
(0x0610 <= cp && cp <= 0x061a) || cp == 0x061c || // 0-width arabic accents
cp == 0x064b ||
(0x064c <= cp && cp <= 0x065f) || cp == 0x0670 || // 0-width arabic accents (2)
(0x06d6 <= cp && cp <= 0x06ed && // 0-width arabic annotation marks, with exceptions
cp != 0x06de && cp != 0x06e5 && cp != 0x06e6 && cp != 0x06e9) ||
cp == 0x070f || cp == 0x0711 ||
(0x0730 <= cp && cp <= 0x074a) ||
(0x07a6 <= cp && cp <= 0x07b0) ||
(0x07eb <= cp && cp <= 0x07f3) ||
cp == 0x07fd) {
return 0;
}
} else if (cp < 0x1000) {
// 0x0800 - 0x0fff: 0-width marks only, no double-width characters.
if ((0x0816 <= cp && cp <= 0x082d &&
cp != 0x081a && cp != 0x0824 && cp != 0x0828) ||
cp == 0x0859 || cp == 0x085a || cp == 0x085b ||
cp == 0x0890 || cp == 0x0891 ||
(0x0898 <= cp && cp <= 0x089f) ||
(0x08ca <= cp && cp <= 0x0902) ||
cp == 0x093a || cp == 0x093c ||
(0x0941 <= cp && cp <= 0x0948) || cp == 0x094d ||
(0x0951 <= cp && cp <= 0x0957) ||
cp == 0x0962 || cp == 0x0963 ||
cp == 0x0981 || cp == 0x09bc ||
(0x09c1 <= cp && cp <= 0x09c4) ||
cp == 0x09cd ||
cp == 0x09e2 || cp == 0x09e3 ||
cp == 0x09fe ||
cp == 0x0a01 || cp == 0x0a02 ||
cp == 0x0a3c ||
cp == 0x0a41 || cp == 0x0a42 ||
cp == 0x0a47 || cp == 0x0a48 ||
cp == 0x0a4b || cp == 0x0a4c || cp == 0x0a4d ||
cp == 0x0a51 ||
cp == 0x0a70 || cp == 0x0a71 ||
cp == 0x0a75 ||
cp == 0x0a81 || cp == 0x0a82 ||
cp == 0x0abc ||
(0x0ac1 <= cp && cp <= 0x0ac8 && cp != 0x0ac6) ||
cp == 0x0acd ||
cp == 0x0ae2 || cp == 0x0ae3 ||
(0x0afa <= cp && cp <= 0x0aff) ||
cp == 0x0b01 ||
cp == 0x0b3c ||
cp == 0x0b3f ||
cp == 0x0b41 || cp == 0x0b42 || cp == 0x0b43 || cp == 0x0b44 ||
cp == 0x0b4d ||
cp == 0x0b55 || cp == 0x0b56 ||
cp == 0x0b62 || cp == 0x0b63 ||
cp == 0x0b82 ||
cp == 0x0bc0 ||
cp == 0x0bcd ||
cp == 0x0c00 ||
cp == 0x0c04 ||
cp == 0x0c3c ||
cp == 0x0c3e || cp == 0x0c3f ||
cp == 0x0c40 ||
cp == 0x0c46 || cp == 0x0c47 || cp == 0x0c48 ||
cp == 0x0c4a || cp == 0x0c4b || cp == 0x0c4c || cp == 0x0c4d ||
cp == 0x0c55 || cp == 0x0c56 ||
cp == 0x0c62 || cp == 0x0c63 ||
cp == 0x0c81 ||
cp == 0x0cbc || cp == 0x0cbf ||
cp == 0x0cc6 ||
cp == 0x0ccc || cp == 0x0ccd ||
cp == 0x0ce2 || cp == 0x0ce3 ||
cp == 0x0d00 ||
cp == 0x0d01 ||
cp == 0x0d3b ||
cp == 0x0d3c ||
cp == 0x0d41 || cp == 0x0d42 || cp == 0x0d43 || cp == 0x0d44 ||
cp == 0x0d4d ||
cp == 0x0d62 || cp == 0x0d63 ||
cp == 0x0d81 ||
cp == 0x0dca ||
cp == 0x0dd2 || cp == 0x0dd3 ||
cp == 0x0dd4 || cp == 0x0dd6 ||
cp == 0x0e31 ||
(0x0e34 <= cp && cp <= 0x0e3a) ||
(0x0e47 <= cp && cp <= 0x0e4e) ||
cp == 0x0eb1 ||
(0x0eb4 <= cp && cp <= 0x0ebc) ||
(0x0ec8 <= cp && cp <= 0x0ece) ||
cp == 0x0f18 || cp == 0x0f19 ||
cp == 0x0f35 ||
cp == 0x0f37 ||
cp == 0x0f39 ||
(0x0f71 <= cp && cp <= 0x0f87 && cp != 0x0f7f && cp != 0x0f85) ||
(0x0f8d <= cp && cp <= 0x0fbc && cp != 0x0f98) ||
cp == 0x0fc6) {
return 0;
}
} else if (cp < 0x2000) {
// 0x1000 - 0x1fff: split in two sub-tables. Code points from 0x1360 up to
// and including 0x1700 match neither and fall through to width 1.
if (cp < 0x1360) {
if (0x1100 <= cp && cp < 0x1160) {
// 0x1100 - 0x115f: double width.
return 2;
} else if ((0x102d <= cp && cp <= 0x1037 && cp != 0x1031) ||
cp == 0x1039 ||
cp == 0x103a ||
cp == 0x103d || cp == 0x103e ||
cp == 0x1058 || cp == 0x1059 ||
cp == 0x105e || cp == 0x105f ||
cp == 0x1060 ||
cp == 0x1071 || cp == 0x1072 || cp == 0x1073 || cp == 0x1074 ||
cp == 0x1082 ||
cp == 0x1085 || cp == 0x1086 ||
cp == 0x108d ||
cp == 0x109d ||
(0x1160 <= cp && cp <= 0x11ff) ||
cp == 0x135d || cp == 0x135e || cp == 0x135f) {
return 0;
}
} else if (0x1700 < cp) {
if ((0x1712 <= cp && cp <= 0x1714) ||
cp == 0x1732 || cp == 0x1733 ||
cp == 0x1752 || cp == 0x1753 ||
cp == 0x1772 || cp == 0x1773 ||
(0x17b4 <= cp && cp <= 0x17bd && cp != 0x17b6) ||
cp == 0x17c6 ||
(0x17c9 <= cp && cp <= 0x17d3) ||
cp == 0x17dd ||
(0x180b <= cp && cp <= 0x180f) ||
cp == 0x1885 || cp == 0x1886 ||
cp == 0x18a9 ||
(0x1920 <= cp && cp <= 0x1922) ||
cp == 0x1927 || cp == 0x1928 ||
cp == 0x1932 ||
cp == 0x1939 || cp == 0x193a || cp == 0x193b ||
cp == 0x1a17 || cp == 0x1a18 ||
cp == 0x1a1b ||
cp == 0x1a56 ||
(0x1a58 <= cp && cp <= 0x1a5e) ||
cp == 0x1a60 ||
cp == 0x1a62 ||
(0x1a65 <= cp && cp <= 0x1a6c) ||
(0x1a73 <= cp && cp <= 0x1a7c) ||
cp == 0x1a7f ||
(0x1ab0 <= cp && cp <= 0x1ace) ||
(0x1b00 <= cp && cp <= 0x1b03) ||
cp == 0x1b34 ||
(0x1b36 <= cp && cp <= 0x1b3a) ||
cp == 0x1b3c ||
cp == 0x1b42 ||
(0x1b6b <= cp && cp <= 0x1b73) ||
cp == 0x1b80 || cp == 0x1b81 ||
(0x1ba2 <= cp && cp <= 0x1ba5) ||
cp == 0x1ba8 || cp == 0x1ba9 ||
cp == 0x1bab || cp == 0x1bac || cp == 0x1bad ||
cp == 0x1be6 ||
cp == 0x1be8 || cp == 0x1be9 ||
cp == 0x1bed ||
cp == 0x1bef ||
cp == 0x1bf0 || cp == 0x1bf1 ||
(0x1c2c <= cp && cp <= 0x1c33) ||
cp == 0x1c36 || cp == 0x1c37 ||
(0x1cd0 <= cp && cp <= 0x1ce8 &&
cp != 0x1cd3 && cp != 0x1ce1) ||
cp == 0x1ced ||
cp == 0x1cf4 ||
cp == 0x1cf8 || cp == 0x1cf9 ||
(0x1dc0 <= cp && cp <= 0x1dfb) ||
(0x1dfc <= cp && cp <= 0x1dff)) {
return 0;
}
}
} else if (cp < 0x3000) {
// 0x2000 - 0x2fff: first the 0-width formatting characters and combining
// marks, then the double-width symbols.
if ((0x200b <= cp && cp <= 0x200f) ||
(0x202a <= cp && cp <= 0x202e) ||
(0x2060 <= cp && cp <= 0x206f && cp != 0x2065) ||
(0x20d0 <= cp && cp <= 0x20ef) ||
cp == 0x20f0 ||
cp == 0x2d7f ||
cp == 0x2cef || cp == 0x2cf0 || cp == 0x2cf1 ||
(0x2de0 <= cp && cp <= 0x2dff)) {
return 0;
} else if ((0x231a <= cp && cp < 0x231c) ||
(0x2329 <= cp && cp < 0x232b) ||
(0x23e9 <= cp && cp < 0x23ed) ||
cp == 0x23f0 ||
cp == 0x23f3 ||
(0x25fd <= cp && cp < 0x25ff) ||
(0x2614 <= cp && cp < 0x2616) ||
(0x2648 <= cp && cp <= 0x2653) ||
cp == 0x267f ||
cp == 0x2693 ||
cp == 0x26a1 ||
cp == 0x26aa || cp == 0x26ab ||
(0x26bd <= cp && cp <= 0x26be) ||
cp == 0x26c4 || cp == 0x26c5 ||
cp == 0x26ce ||
cp == 0x26d4 ||
cp == 0x26ea ||
cp == 0x26f2 || cp == 0x26f3 ||
cp == 0x26f5 || cp == 0x26fa || cp == 0x26fd ||
cp == 0x2705 || cp == 0x270a || cp == 0x270b ||
cp == 0x2728 ||
cp == 0x274c ||
cp == 0x274e ||
(0x2753 <= cp && cp <= 0x2755) || cp == 0x2757 ||
(0x2795 <= cp && cp <= 0x2797) ||
cp == 0x27b0 || cp == 0x27bf ||
cp == 0x2b1b || cp == 0x2b1c ||
cp == 0x2b50 || cp == 0x2b55 ||
(0x2e80 <= cp && cp < 0x2fd6) ||
(0x2ff0 <= cp && cp < 0x2ffc)) {
// Exceptions inside the 0x2e80 - 0x2fd5 range, these stay at width 1.
if (!(cp == 0x2e9a ||
(0x2ef4 <= cp && cp <= 0x2eff))) {
return 2;
}
}
} else if (cp <= 0x10000) {
// 0x3000 - 0x10000: the 0-width characters are checked first, then the
// double-width ranges.
if (cp < 0x3030) {
if (cp == 0x302a ||
cp == 0x302b ||
cp == 0x302c ||
cp == 0x302d) {
return 0;
}
} else if (0xa660 < cp) {
if ((0xa66f <= cp && cp <= 0xa67d &&
cp != 0xa673) ||
cp == 0xa6f0 || cp == 0xa6f1 ||
cp == 0xa69e || cp == 0xa69f ||
cp == 0xa802 ||
cp == 0xa806 ||
cp == 0xa80b ||
cp == 0xa825 || cp == 0xa826 ||
cp == 0xa8c4 || cp == 0xa8c5 ||
(0xa8e0 <= cp && cp <= 0xa8f1) ||
cp == 0xa8ff ||
(0xa926 <= cp && cp <= 0xa92d) ||
(0xa947 <= cp && cp <= 0xa951) ||
(0xa980 <= cp && cp <= 0xa982) ||
cp == 0xa9b3 ||
(0xa9b6 <= cp && cp <= 0xa9b9) ||
cp == 0xa9bc || cp == 0xa9bd ||
cp == 0xa9e5 ||
(0xaa29 <= cp && cp <= 0xaa2e) ||
cp == 0xaa31 || cp == 0xaa32 ||
cp == 0xaa35 || cp == 0xaa36 ||
cp == 0xaa43 ||
cp == 0xaa4c ||
cp == 0xaa7c ||
cp == 0xaab0 ||
cp == 0xaab2 || cp == 0xaab3 || cp == 0xaab4 ||
cp == 0xaab7 || cp == 0xaab8 ||
cp == 0xaaec || cp == 0xaaed || cp == 0xaabe || cp == 0xaabf ||
cp == 0xaac1 ||
cp == 0xaaf6 ||
cp == 0xabe5 ||
cp == 0xabe8 ||
cp == 0xabed ||
(0xd7b0 <= cp && cp <= 0xd7ff) ||
// -- 0xd86f ... 0xd8a2 prints as if it was width 0, but setting
// it here has apparently no effect.
// (0xd86f <= cp && cp <= 0xd8a2) ||
(0xfe00 <= cp && cp <= 0xfe0f) ||
(0xfe20 <= cp && cp <= 0xfe2f) ||
cp == 0xfeff ||
(0xfff9 <= cp && cp <= 0xfffb)) {
return 0;
}
}
// 0x3000 - 0x4dbf: everything is double width, except the 0-width marks
// and the exclusions listed below (which get width 1).
if ((cp < 0x4dc0)) {
if (cp == 0x3097 ||
cp == 0x3099 ||
cp == 0x309a) {
return 0;
}
if (!(
cp == 0x3040 ||
(0x3097 <= cp && cp <= 0x309a) ||
(0x3100 <= cp && cp <= 0x3104) ||
cp == 0x303f ||
cp == 0x3130 ||
cp == 0x318f ||
(0x31e4 <= cp && cp <= 0x31ef) ||
cp == 0x321f ||
(0x3248 <= cp && cp <= 0x324f))) {
return 2;
}
} else if (
(0x4e00 <= cp && cp < 0xa4c7) || // CJK main group of ideographs, and the ranges following it
(0xa960 <= cp && cp < 0xa97d) ||
(0xac00 <= cp && cp < 0xd7a4) ||
(0xf900 <= cp && cp < 0xfb00) ||
(0xfe10 <= cp && cp < 0xfe1a) ||
(0xfe30 <= cp && cp < 0xfe6c) ||
(0xff01 <= cp && cp < 0xff61) ||
(0xffe0 <= cp && cp < 0xffe7)) {
// Some excluded chars and ranges.
if (!(
(0xa48d <= cp && cp <= 0xa48f) ||
cp == 0xfe53 ||
cp == 0xfe67)) {
// CJK or other double-width character.
return 2;
}
}
} else {
// Character.isIdeographic(cp) does not mean the same thing as 'double width'.
if ((0x00020000 <= cp && cp < 0x0002A6C0) || // CJK Extension B
(0x0002A700 <= cp && cp < 0x0002CEB0)) { // CJK Extension C, D, E
return 2;
}
}
// Not in any of the 0-width or double-width tables.
return 1;
}
/**
 * Number of java chars (UTF-16 units) needed to hold this character.
 *
 * @return 1 for a BMP code point, otherwise 2.
 */
@Override
public int length() {
    return isBmpCodePoint(cp) ? 1 : 2;
}
/**
 * Two instances are equal when they are of exactly the same class and hold
 * the same code point.
 *
 * @param o The object to compare with.
 * @return True if equal.
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null) {
        return false;
    }
    if (!getClass().equals(o.getClass())) {
        return false;
    }
    Unicode other = (Unicode) o;
    return other.cp == cp;
}
/**
 * The character itself as a string, without any escaping.
 *
 * @return A one char string for a BMP code point, otherwise the two char
 *         string made from the high and low surrogate of the code point.
 */
@Override
public String toString() {
    if (isBmpCodePoint(cp)) {
        return String.valueOf((char) cp);
    }
    char high = Character.highSurrogate(cp);
    char low = Character.lowSurrogate(cp);
    return new String(new char[]{high, low});
}
/**
* Hash over the {@code Unicode} class object and the code point, so instances
* with the same code point get the same hash.
*
* @return The hash code.
*/
@Override
public int hashCode() {
return Objects.hash(Unicode.class, cp);
}
/**
 * Order by numeric code point value.
 * <p>
 * Only the code point of the other char is considered, not its class, so a
 * result of 0 does not imply {@link #equals(Object)}.
 *
 * @param o The char to compare with. Must not be null.
 * @return Negative, 0 or positive if this code point is less than, equal to
 *         or greater than the other.
 */
@Override
public int compareTo(Char o) {
    int other = o.codepoint();
    return Integer.compare(cp, other);
}
}