// Unicode.java

/*
 * Copyright (c) 2016, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.strings.chr;

import java.util.Objects;

import static java.lang.Character.isBmpCodePoint;
import static net.morimekta.strings.ConsoleUtil.isConsolePrintable;

/**
 * Representation of a unicode character. It represents the full 31-bit unicode
 * code point, and will expand to a 2-char surrogate pair string if necessary.
 */
public class Unicode
        implements Char {
    /** No-break space (U+00A0). */
    public static final Unicode NBSP = new Unicode(0x00A0);

    /**
     * Left-to-right isolate (U+2066): sets the base direction to LTR and
     * isolates the embedded content from the surrounding text.
     */
    public static final Unicode LTR_ISOLATE = new Unicode(0x2066);

    /**
     * Right-to-left isolate (U+2067): sets the base direction to RTL and
     * isolates the embedded content from the surrounding text.
     */
    public static final Unicode RTL_ISOLATE = new Unicode(0x2067);

    /**
     * First-strong isolate (U+2068): isolates the content and sets the
     * direction according to the first strongly typed directional character.
     */
    public static final Unicode FS_ISOLATE = new Unicode(0x2068);

    /**
     * Left-to-right embedding (U+202A): sets the base direction to LTR but
     * allows the embedded text to interact with the surrounding content, so
     * there is a risk of spillover effects.
     */
    public static final Unicode LTR_EMBED = new Unicode(0x202A);

    /**
     * Right-to-left embedding (U+202B): sets the base direction to RTL but
     * allows the embedded text to interact with the surrounding content, so
     * there is a risk of spillover effects.
     */
    public static final Unicode RTL_EMBED = new Unicode(0x202B);

    /**
     * Left-to-right override (U+202D): overrides the bidirectional algorithm
     * to display characters in memory order, progressing from left to right.
     */
    public static final Unicode LTR_OVERRIDE = new Unicode(0x202D);

    /**
     * Right-to-left override (U+202E): overrides the bidirectional algorithm
     * to display characters in memory order, progressing from right to left.
     */
    public static final Unicode RTL_OVERRIDE = new Unicode(0x202E);

    /**
     * Create a unicode instance from a code point. This is a static factory
     * that delegates directly to the {@code int} constructor.
     *
     * @param cp Code point of unicode character, can be BMP or extended.
     * @return The unicode char.
     */
    public static Unicode unicode(int cp) {
        return new Unicode(cp);
    }

    /**
     * Create a unicode instance from a single java {@code char}. This is a
     * static factory that delegates directly to the {@code char} constructor.
     *
     * @param ch The unicode character (BMP).
     * @return The unicode char.
     */
    public static Unicode unicode(char ch) {
        return new Unicode(ch);
    }

    /** The code point represented by this instance. */
    private final int cp;

    /**
     * Create a unicode instance. The code point is stored as given.
     *
     * @param cp Character code point.
     */
    public Unicode(int cp) {
        this.cp = cp;
    }

    /**
     * Create a unicode instance. The {@code char} value is widened to an
     * {@code int} and used as the code point.
     *
     * @param ch Unicode character.
     */
    public Unicode(char ch) {
        this.cp = ch;
    }

    /**
     * @return The code point this instance was created with.
     */
    @Override
    public int codepoint() {
        return cp;
    }

    /**
     * Build a display-safe representation of the character.
     * <ul>
     *   <li>Selected control and formatting characters become a named
     *       placeholder, e.g. {@code <ESC>} or {@code <nbsp>}.</li>
     *   <li>Tab, line feed, form feed, carriage return, both quote characters
     *       and the backslash become a backslash escape.</li>
     *   <li>Any other code point below {@code 0x20} becomes a 3-digit octal
     *       escape.</li>
     *   <li>Console printable code points are returned as themselves, as a
     *       surrogate pair when outside the BMP.</li>
     *   <li>Everything else becomes a <code>&#92;uXXXX</code> escape, one per
     *       surrogate when outside the BMP.</li>
     * </ul>
     *
     * @return The escaped or literal string for this character.
     */
    @Override
    public String asString() {
        switch (cp) {
            // Named placeholders.
            case NUL:    return "<NUL>";
            case ABR:    return "<ABR>";
            case EOF:    return "<EOF>";
            case ENQ:    return "<ENQ>";
            case ACK:    return "<ACK>";
            case NAK:    return "<NAK>";
            case BEL:    return "<BEL>"; // Not using '\a', as not universal.
            case BS:     return "<BS>";  // Not using '\b'. It conflicts with C definition of BEL.
            case VT:     return "<VT>";
            case SS:     return "<SS>";
            case SI:     return "<SI>";
            case ESC:    return "<ESC>";
            case CAN:    return "<CAN>";
            case XON:    return "<XON>";
            case XOFF:   return "<XOFF>";
            case FS:     return "<FS>";
            case GS:     return "<GS>";
            case RS:     return "<RS>";
            case US:     return "<US>";
            case DEL:    return "<DEL>";
            case 0x00A0: return "<nbsp>";
            case 0x2066: return "<ltr-isolate>";
            case 0x2067: return "<rtl-isolate>";
            case 0x2068: return "<fs-isolate>";
            case 0x202A: return "<ltr-embedded>";
            case 0x202B: return "<rtl-embedded>";
            case 0x202D: return "<ltr-override>";
            case 0x202E: return "<rtl-override>";

            // Backslash escapes.
            case TAB:    return "\\t";
            case LF:     return "\\n";
            case FF:     return "\\f";
            case CR:     return "\\r";
            case '\"':
            case '\'':
            case '\\':
                return "\\" + (char) cp;

            default:
                break;
        }

        if (cp < 0x20) {
            // Use escaped octal value encoding.
            return String.format("\\%03o", cp);
        }

        if (isConsolePrintable(cp)) {
            if (isBmpCodePoint(cp)) {
                return String.valueOf((char) cp);
            }
            char[] pair = {Character.highSurrogate(cp), Character.lowSurrogate(cp)};
            return new String(pair);
        }

        if (isBmpCodePoint(cp)) {
            return String.format("\\u%04x", cp);
        }
        int high = Character.highSurrogate(cp);
        int low = Character.lowSurrogate(cp);
        return String.format("\\u%04x", high) + String.format("\\u%04x", low);
    }

    /**
     * Printable width of the character, looked up in hand-maintained code
     * point tables.
     * <ul>
     *   <li>{@code 0} for the listed control characters, combining marks and
     *       other zero-width code points.</li>
     *   <li>{@code 2} for the listed double-width code points (for instance
     *       the CJK ranges named in the inline comments).</li>
     *   <li>{@code 1} for everything else, including every code point that
     *       is not mentioned in any of the tables.</li>
     * </ul>
     * The tables are split into bands by code point value, so each call only
     * evaluates the conditions of the single band the code point falls in.
     *
     * @return The printable width: 0, 1 or 2.
     */
    @Override
    public int printableWidth() {
        if (cp < 0x0300) {
            // Band below 0x0300: only values below 0x20 (this also catches
            // negative values) and 0x7F..0x9F are zero-width.
            if (cp < 0x20 ||
                (0x7F <= cp && cp < 0xA0)) {
                return 0;
            }
        } else if (cp < 0x0600) {
            // Band 0x0300..0x05FF.
            if ((cp < 0x0370) || // 0x0300..0x036f: 0-width combining diacritics
                (0x0483 <= cp && cp <= 0x0489) || // 0-width symbols
                (0x0591 <= cp && cp <= 0x05c7 &&  // 0-width marks. NOTE(review): this range is in the Hebrew block; the label used to say sanskrit.
                 cp != 0x05be && cp != 0x05c0 && cp != 0x05c3 && cp != 0x05c6)) {
                return 0;
            }
        } else if (cp < 0x0800) {
            // Band 0x0600..0x07FF.
            if ((cp <= 0x0605) || // 0x0600..0x0605. NOTE(review): this range is in the Arabic block; the label used to say hebrew.
                (0x0610 <= cp && cp <= 0x061a) || cp == 0x061c || // 0-width arabic accents
                cp == 0x064b ||
                (0x064c <= cp && cp <= 0x065f) || cp == 0x0670 ||  // 0-width arabic accents (2)
                (0x06d6 <= cp && cp <= 0x06ed && // 0-width symbols (2)
                 cp != 0x06de && cp != 0x06e5 && cp != 0x06e6 && cp != 0x06e9) ||
                cp == 0x070f || cp == 0x0711 ||
                (0x0730 <= cp && cp <= 0x074a) ||
                (0x07a6 <= cp && cp <= 0x07b0) ||
                (0x07eb <= cp && cp <= 0x07f3) ||
                cp == 0x07fd) {
                return 0;
            }
        } else if (cp < 0x1000) {
            // Band 0x0800..0x0FFF: zero-width entries only.
            if ((0x0816 <= cp && cp <= 0x082d &&
                 cp != 0x081a && cp != 0x0824 && cp != 0x0828) ||
                cp == 0x0859 || cp == 0x085a || cp == 0x085b ||
                cp == 0x0890 || cp == 0x0891 ||
                (0x0898 <= cp && cp <= 0x089f) ||
                (0x08ca <= cp && cp <= 0x0902) ||
                cp == 0x093a || cp == 0x093c ||
                (0x0941 <= cp && cp <= 0x0948) || cp == 0x094d ||
                (0x0951 <= cp && cp <= 0x0957) ||
                cp == 0x0962 || cp == 0x0963 ||
                cp == 0x0981 || cp == 0x09bc ||
                (0x09c1 <= cp && cp <= 0x09c4) ||
                cp == 0x09cd ||
                cp == 0x09e2 || cp == 0x09e3 ||
                cp == 0x09fe ||
                cp == 0x0a01 || cp == 0x0a02 ||
                cp == 0x0a3c ||
                cp == 0x0a41 || cp == 0x0a42 ||
                cp == 0x0a47 || cp == 0x0a48 ||
                cp == 0x0a4b || cp == 0x0a4c || cp == 0x0a4d ||
                cp == 0x0a51 ||
                cp == 0x0a70 || cp == 0x0a71 ||
                cp == 0x0a75 ||
                cp == 0x0a81 || cp == 0x0a82 ||
                cp == 0x0abc ||
                (0x0ac1 <= cp && cp <= 0x0ac8 && cp != 0x0ac6) ||
                cp == 0x0acd ||
                cp == 0x0ae2 || cp == 0x0ae3 ||
                (0x0afa <= cp && cp <= 0x0aff) ||
                cp == 0x0b01 ||
                cp == 0x0b3c ||
                cp == 0x0b3f ||
                cp == 0x0b41 || cp == 0x0b42 || cp == 0x0b43 || cp == 0x0b44 ||
                cp == 0x0b4d ||
                cp == 0x0b55 || cp == 0x0b56 ||
                cp == 0x0b62 || cp == 0x0b63 ||
                cp == 0x0b82 ||
                cp == 0x0bc0 ||
                cp == 0x0bcd ||
                cp == 0x0c00 ||
                cp == 0x0c04 ||
                cp == 0x0c3c ||
                cp == 0x0c3e || cp == 0x0c3f ||
                cp == 0x0c40 ||
                cp == 0x0c46 || cp == 0x0c47 || cp == 0x0c48 ||
                cp == 0x0c4a || cp == 0x0c4b || cp == 0x0c4c || cp == 0x0c4d ||
                cp == 0x0c55 || cp == 0x0c56 ||
                cp == 0x0c62 || cp == 0x0c63 ||
                cp == 0x0c81 ||
                cp == 0x0cbc || cp == 0x0cbf ||
                cp == 0x0cc6 ||
                cp == 0x0ccc || cp == 0x0ccd ||
                cp == 0x0ce2 || cp == 0x0ce3 ||
                cp == 0x0d00 ||
                cp == 0x0d01 ||
                cp == 0x0d3b ||
                cp == 0x0d3c ||
                cp == 0x0d41 || cp == 0x0d42 || cp == 0x0d43 || cp == 0x0d44 ||
                cp == 0x0d4d ||
                cp == 0x0d62 || cp == 0x0d63 ||
                cp == 0x0d81 ||
                cp == 0x0dca ||
                cp == 0x0dd2 || cp == 0x0dd3 ||
                cp == 0x0dd4 || cp == 0x0dd6 ||
                cp == 0x0e31 ||
                (0x0e34 <= cp && cp <= 0x0e3a) ||
                (0x0e47 <= cp && cp <= 0x0e4e) ||
                cp == 0x0eb1 ||
                (0x0eb4 <= cp && cp <= 0x0ebc) ||
                (0x0ec8 <= cp && cp <= 0x0ece) ||
                cp == 0x0f18 || cp == 0x0f19 ||
                cp == 0x0f35 ||
                cp == 0x0f37 ||
                cp == 0x0f39 ||
                (0x0f71 <= cp && cp <= 0x0f87 && cp != 0x0f7f && cp != 0x0f85) ||
                (0x0f8d <= cp && cp <= 0x0fbc && cp != 0x0f98) ||
                cp == 0x0fc6) {
                return 0;
            }
        } else if (cp < 0x2000) {
            // Band 0x1000..0x1FFF, split in two sub-tables. Code points in
            // 0x1360..0x1700 are in neither and end up with width 1.
            if (cp < 0x1360) {
                if (0x1100 <= cp && cp < 0x1160) {
                    // 0x1100..0x115f is the only double-width range in this band.
                    return 2;
                } else if ((0x102d <= cp && cp <= 0x1037 && cp != 0x1031) ||
                           cp == 0x1039 ||
                           cp == 0x103a ||
                           cp == 0x103d || cp == 0x103e ||
                           cp == 0x1058 || cp == 0x1059 ||
                           cp == 0x105e || cp == 0x105f ||
                           cp == 0x1060 ||
                           cp == 0x1071 || cp == 0x1072 || cp == 0x1073 || cp == 0x1074 ||
                           cp == 0x1082 ||
                           cp == 0x1085 || cp == 0x1086 ||
                           cp == 0x108d ||
                           cp == 0x109d ||
                           (0x1160 <= cp && cp <= 0x11ff) ||
                           cp == 0x135d || cp == 0x135e || cp == 0x135f) {
                    return 0;
                }
            } else if (0x1700 < cp) {
                // Zero-width entries in 0x1701..0x1FFF.
                if ((0x1712 <= cp && cp <= 0x1714) ||
                    cp == 0x1732 || cp == 0x1733 ||
                    cp == 0x1752 || cp == 0x1753 ||
                    cp == 0x1772 || cp == 0x1773 ||
                    (0x17b4 <= cp && cp <= 0x17bd && cp != 0x17b6) ||
                    cp == 0x17c6 ||
                    (0x17c9 <= cp && cp <= 0x17d3) ||
                    cp == 0x17dd ||
                    (0x180b <= cp && cp <= 0x180f) ||
                    cp == 0x1885 || cp == 0x1886 ||
                    cp == 0x18a9 ||
                    (0x1920 <= cp && cp <= 0x1922) ||
                    cp == 0x1927 || cp == 0x1928 ||
                    cp == 0x1932 ||
                    cp == 0x1939 || cp == 0x193a || cp == 0x193b ||

                    cp == 0x1a17 || cp == 0x1a18 ||
                    cp == 0x1a1b ||
                    cp == 0x1a56 ||
                    (0x1a58 <= cp && cp <= 0x1a5e) ||
                    cp == 0x1a60 ||
                    cp == 0x1a62 ||
                    (0x1a65 <= cp && cp <= 0x1a6c) ||
                    (0x1a73 <= cp && cp <= 0x1a7c) ||
                    cp == 0x1a7f ||
                    (0x1ab0 <= cp && cp <= 0x1ace) ||
                    (0x1b00 <= cp && cp <= 0x1b03) ||
                    cp == 0x1b34 ||
                    (0x1b36 <= cp && cp <= 0x1b3a) ||
                    cp == 0x1b3c ||
                    cp == 0x1b42 ||
                    (0x1b6b <= cp && cp <= 0x1b73) ||
                    cp == 0x1b80 || cp == 0x1b81 ||
                    (0x1ba2 <= cp && cp <= 0x1ba5) ||
                    cp == 0x1ba8 || cp == 0x1ba9 ||
                    cp == 0x1bab || cp == 0x1bac || cp == 0x1bad ||
                    cp == 0x1be6 ||
                    cp == 0x1be8 || cp == 0x1be9 ||
                    cp == 0x1bed ||
                    cp == 0x1bef ||
                    cp == 0x1bf0 || cp == 0x1bf1 ||
                    (0x1c2c <= cp && cp <= 0x1c33) ||
                    cp == 0x1c36 || cp == 0x1c37 ||
                    (0x1cd0 <= cp && cp <= 0x1ce8 &&
                     cp != 0x1cd3 && cp != 0x1ce1) ||
                    cp == 0x1ced ||
                    cp == 0x1cf4 ||
                    cp == 0x1cf8 || cp == 0x1cf9 ||
                    (0x1dc0 <= cp && cp <= 0x1dfb) ||
                    (0x1dfc <= cp && cp <= 0x1dff)) {
                    return 0;
                }
            }
        } else if (cp < 0x3000) {
            // Band 0x2000..0x2FFF: zero-width entries first, then the
            // double-width entries.
            if ((0x200b <= cp && cp <= 0x200f) ||
                (0x202a <= cp && cp <= 0x202e) ||
                (0x2060 <= cp && cp <= 0x206f && cp != 0x2065) ||
                (0x20d0 <= cp && cp <= 0x20ef) ||
                cp == 0x20f0 ||
                cp == 0x2d7f ||
                cp == 0x2cef || cp == 0x2cf0 || cp == 0x2cf1 ||
                (0x2de0 <= cp && cp <= 0x2dff)) {
                return 0;
            } else if ((0x231a <= cp && cp < 0x231c) ||
                       (0x2329 <= cp && cp < 0x232b) ||
                       (0x23e9 <= cp && cp < 0x23ed) ||
                       cp == 0x23f0 ||
                       cp == 0x23f3 ||
                       (0x25fd <= cp && cp < 0x25ff) ||
                       (0x2614 <= cp && cp < 0x2616) ||
                       (0x2648 <= cp && cp <= 0x2653) ||
                       cp == 0x267f ||
                       cp == 0x2693 ||
                       cp == 0x26a1 ||
                       cp == 0x26aa || cp == 0x26ab ||
                       (0x26bd <= cp && cp <= 0x26be) ||
                       cp == 0x26c4 || cp == 0x26c5 ||
                       cp == 0x26ce ||
                       cp == 0x26d4 ||
                       cp == 0x26ea ||
                       cp == 0x26f2 || cp == 0x26f3 ||
                       cp == 0x26f5 || cp == 0x26fa || cp == 0x26fd ||
                       cp == 0x2705 || cp == 0x270a || cp == 0x270b ||
                       cp == 0x2728 ||
                       cp == 0x274c ||
                       cp == 0x274e ||
                       (0x2753 <= cp && cp <= 0x2755) || cp == 0x2757 ||
                       (0x2795 <= cp && cp <= 0x2797) ||
                       cp == 0x27b0 || cp == 0x27bf ||
                       cp == 0x2b1b || cp == 0x2b1c ||
                       cp == 0x2b50 || cp == 0x2b55 ||
                       (0x2e80 <= cp && cp < 0x2fd6) ||
                       (0x2ff0 <= cp && cp < 0x2ffc)) {
                // Holes inside the double-width ranges above; these fall
                // through to the default width of 1.
                if (!(cp == 0x2e9a ||
                      (0x2ef4 <= cp && cp <= 0x2eff))) {
                    return 2;
                }
            }
        } else if (cp <= 0x10000) {
            // Band 0x3000..0x10000. The upper bound is inclusive, so 0x10000
            // itself is handled here; it matches none of the tables below
            // and ends up with width 1.
            //
            // Step 1: zero-width entries. Code points in 0x3030..0xa660 have
            // no zero-width check in this step.
            if (cp < 0x3030) {
                if (cp == 0x302a ||
                    cp == 0x302b ||
                    cp == 0x302c ||
                    cp == 0x302d) {
                    return 0;
                }
            } else if (0xa660 < cp) {
                if ((0xa66f <= cp && cp <= 0xa67d &&
                     cp != 0xa673) ||
                    cp == 0xa6f0 || cp == 0xa6f1 ||
                    cp == 0xa69e || cp == 0xa69f ||
                    cp == 0xa802 ||
                    cp == 0xa806 ||
                    cp == 0xa80b ||
                    cp == 0xa825 || cp == 0xa826 ||
                    cp == 0xa8c4 || cp == 0xa8c5 ||
                    (0xa8e0 <= cp && cp <= 0xa8f1) ||
                    cp == 0xa8ff ||
                    (0xa926 <= cp && cp <= 0xa92d) ||
                    (0xa947 <= cp && cp <= 0xa951) ||
                    (0xa980 <= cp && cp <= 0xa982) ||
                    cp == 0xa9b3 ||
                    (0xa9b6 <= cp && cp <= 0xa9b9) ||
                    cp == 0xa9bc || cp == 0xa9bd ||
                    cp == 0xa9e5 ||
                    (0xaa29 <= cp && cp <= 0xaa2e) ||
                    cp == 0xaa31 || cp == 0xaa32 ||
                    cp == 0xaa35 || cp == 0xaa36 ||
                    cp == 0xaa43 ||
                    cp == 0xaa4c ||
                    cp == 0xaa7c ||
                    cp == 0xaab0 ||
                    cp == 0xaab2 || cp == 0xaab3 || cp == 0xaab4 ||
                    cp == 0xaab7 || cp == 0xaab8 ||
                    cp == 0xaaec || cp == 0xaaed || cp == 0xaabe || cp == 0xaabf ||
                    cp == 0xaac1 ||
                    cp == 0xaaf6 ||
                    cp == 0xabe5 ||
                    cp == 0xabe8 ||
                    cp == 0xabed ||
                    (0xd7b0 <= cp && cp <= 0xd7ff) ||
                    // -- 0xd86f ... ud8a2 prints as if it was width 0, but setting
                    // it here has apparently no effect.
                    // (0xd86f <= cp && cp <= 0xd8a2) ||
                    (0xfe00 <= cp && cp <= 0xfe0f) ||
                    (0xfe20 <= cp && cp <= 0xfe2f) ||
                    cp == 0xfeff ||
                    (0xfff9 <= cp && cp <= 0xfffb)) {
                    return 0;
                }
            }
            // Step 2: double-width entries, evaluated for everything in the
            // band that was not already reported as zero-width.
            // CJK compatibility (square symbols), Extension A
            if ((cp < 0x4dc0)) {
                if (cp == 0x3097 ||
                    cp == 0x3099 ||
                    cp == 0x309a) {
                    return 0;
                }
                // Everything in 0x3000..0x4dbf is double-width except the
                // holes listed here, which get width 1. Of the range
                // 0x3097..0x309a only 0x3098 can still reach this point.
                if (!(
                        cp == 0x3040 ||
                        (0x3097 <= cp && cp <= 0x309a) ||
                        (0x3100 <= cp && cp <= 0x3104) ||
                        cp == 0x303f ||
                        cp == 0x3130 ||
                        cp == 0x318f ||
                        (0x31e4 <= cp && cp <= 0x31ef) ||
                        cp == 0x321f ||
                        (0x3248 <= cp && cp <= 0x324f))) {
                    return 2;
                }
            } else if (
                    // 0x4dc0..0x4dff is not listed and therefore gets width 1.
                    (0x4e00 <= cp && cp < 0xa4c7) ||  // CJK Main group of ideographs ↓ ↓
                    (0xa960 <= cp && cp < 0xa97d) ||
                    (0xac00 <= cp && cp < 0xd7a4) ||
                    (0xf900 <= cp && cp < 0xfb00) ||
                    (0xfe10 <= cp && cp < 0xfe1a) ||
                    (0xfe30 <= cp && cp < 0xfe6c) ||
                    (0xff01 <= cp && cp < 0xff61) ||
                    (0xffe0 <= cp && cp < 0xffe7)) {
                // Some excluded chars and ranges.
                if (!(
                        (0xa48d <= cp && cp <= 0xa48f) ||
                        cp == 0xfe53 ||
                        cp == 0xfe67)) {
                    // CJK or other double-width character.
                    return 2;
                }
            }
        } else {
            // Above 0x10000: only the two ranges below are double-width, no
            // zero-width entries are listed.
            // Character.isIdeographic(cp) does not mean the same thing as 'double width'.
            if ((0x00020000 <= cp && cp < 0x0002A6C0) ||  // CJK Extension B
                (0x0002A700 <= cp && cp < 0x0002CEB0)) {  // CJK Extension C, D, E
                return 2;
            }
        }

        // Not found in any table: regular single-width character.
        return 1;
    }

    /**
     * Number of java {@code char} values needed to hold this character.
     *
     * @return 1 when the code point is in the BMP, otherwise 2.
     */
    @Override
    public int length() {
        return isBmpCodePoint(cp) ? 1 : 2;
    }

    /**
     * Two instances are equal when they are of exactly the same class and
     * hold the same code point.
     *
     * @param o The object to compare with.
     * @return True if equal.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o != null && getClass().equals(o.getClass())) {
            Unicode other = (Unicode) o;
            return other.cp == cp;
        }
        return false;
    }

    /**
     * The raw character as a string, without any escaping.
     *
     * @return A single-char string for BMP code points, otherwise the high
     *         and low surrogate as a 2-char string.
     */
    @Override
    public String toString() {
        if (isBmpCodePoint(cp)) {
            return String.valueOf((char) cp);
        }
        char[] surrogates = {
                Character.highSurrogate(cp),
                Character.lowSurrogate(cp)
        };
        return new String(surrogates);
    }

    /**
     * Hash of the {@code Unicode} class object combined with the code point.
     *
     * @return The hash code.
     */
    @Override
    public int hashCode() {
        // Spelled-out form of Objects.hash(Unicode.class, cp), which folds
        // its arguments as: result = 31 * result + hash(element), from 1.
        int classHash = Unicode.class.hashCode();
        return 31 * (31 + classHash) + cp;
    }

    /**
     * Order by numeric code point value. The other char is only asked for
     * its {@code codepoint()}, so any {@code Char} can be compared against.
     *
     * @param o The char to compare with. Must not be null.
     * @return Negative, zero or positive as this code point is lower than,
     *         equal to or higher than the other.
     */
    @Override
    public int compareTo(Char o) {
        return Integer.compare(cp, o.codepoint());
    }
}