T61Charset.java

/*
 * Copyright (c) 2020, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.strings.enc;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.text.Normalizer;

import static java.text.Normalizer.Form.NFKC;

/**
 * The T.61 charset, a.k.a. Teletex, is an ITU-T encoding standard used in some
 * places in the telco world, though it has been phased out in places in favor
 * of ASCII (called IA5, or string encoding type 5) and UTF-8. It is still part
 * of the ASN.1 encoding standard, so it remains in use in some places (mainly
 * legacy systems).
 * <p>
 * The encoding set contains a subset of US-ASCII characters, plus a number of
 * selected diacritics and special characters used in major latin based
 * languages.
 * <p>
 * - <a href="https://en.wikipedia.org/wiki/ITU_T.61">Wikipedia on T.61</a><br>
 * - <a href="https://en.wikipedia.org/wiki/Teletex">Wikipedia on Teletex</a>
 */
public class T61Charset extends Charset {
    /** The T.61 charset instance. */
    public static final Charset T61 = new T61Charset();

    /**
     * Decoding table: the char at index {@code i} is the Unicode character for
     * the T.61 byte value {@code i} (0..255). A {@code '\0'} entry marks a byte
     * value with no mapping. The row starting at index 0xC0 holds the
     * combining diacritics, which T.61 places <em>before</em> the base letter.
     */
    private static final String MAPPING =
            "\0\0\0\0\0\0\0\0" + "\b\0\n\0\f\r\016\017" +
            "\0\0\0\0\0\0\0\0" + "\0\u008e\u001a\u001b\0\u008f\0\0" +
            " !\"\0\0%&'" + "()*+,-./" +
            "01234567" + "89:;<=>?" +
            "@ABCDEFG" + "HIJKLMNO" +
            "PQRSTUVW" + "XYZ[\0]\0_" +
            "\0abcdefg" + "hijklmno" +
            "pqrstuvw" + "xyz\0|\0\0\177" +
            "\0\0\0\0\0\0\0\0" + "\0\0\0\u008b\u008c\0\0\0" +
            "\0\0\0\0\0\0\0\0" + "\0\0\0\u009b\0\0\0\0" +
            "\u00A0¡¢£$¥#§" + "¤\0\0«\0\0\0\0" +
            "°±²³×µ¶·" + "÷\0\0»¼½¾¿" +
            "\0\u0300\u0301\u0302\u0303\u0304\u0306\u0307" + "\u0308\u0308\u030a\u0327\u0332\u030b\u0328\u030c" +
            "\0\0\0\0\0\0\0\0" + "\0\0\0\0\0\0\0\0" +
            "ΩÆÐªĦ\0ĲĿ" + "ŁØŒºÞŦŊŉ" +
            "ĸæđðħıĳŀ" + "łøœßþŧŋ\0";

    /**
     * Upper bound on chars produced per input byte. A diacritic + letter pair
     * (two bytes) is NFKC-normalized, and letters with a compatibility
     * decomposition may expand, so one char per byte is not a safe bound.
     */
    private static final float MAX_CHARS_PER_BYTE = 2f;

    /**
     * Upper bound on bytes produced per input char. Characters whose NFKD
     * expansion is longer than this are reported as unmappable, so the bound
     * declared to {@link CharsetEncoder} always holds.
     */
    private static final int MAX_BYTES_PER_CHAR = 4;

    private T61Charset() {
        super("T.61", new String[]{"IBM-01036", "CP1036"});
    }

    @Override
    public boolean contains(Charset charset) {
        // This is not QUITE a subset of US-ASCII, and US-ASCII is not QUITE
        // a subset of this.
        return charset instanceof T61Charset;
    }

    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder();
    }

    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder();
    }

    /**
     * Decoder from T.61 bytes to chars. A diacritic byte is combined with the
     * byte that follows it and the pair is emitted in NFKC form.
     */
    static class Decoder extends CharsetDecoder {
        protected Decoder() {
            super(T61, 1f, MAX_CHARS_PER_BYTE);
        }

        /**
         * Decodes as many bytes as possible.
         *
         * @param byteBuffer The T.61 encoded input.
         * @param charBuffer The output to write decoded chars to.
         * @return {@code UNDERFLOW} when the input is consumed or ends in a
         *         diacritic byte that still needs its base letter (the byte is
         *         left unread), {@code OVERFLOW} when the output is full, or
         *         an unmappable result positioned at the offending byte(s).
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer byteBuffer, CharBuffer charBuffer) {
            while (byteBuffer.hasRemaining()) {
                if (!charBuffer.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }

                // Start of the current unit, so that every early return can
                // leave the input positioned at the first unconsumed byte.
                int mark = byteBuffer.position();
                char c = charAt(byteBuffer.get());
                if (c == '\0') {
                    byteBuffer.position(mark);
                    return CoderResult.unmappableForLength(1);
                }
                if (!isDiacritic(c)) {
                    charBuffer.put(c);
                    continue;
                }

                // T.61 diacritics come *before* the letter, not after as in UCS.
                if (!byteBuffer.hasRemaining()) {
                    // The base letter may arrive with the next chunk. At real
                    // end of input the framework reports the leftover byte.
                    byteBuffer.position(mark);
                    return CoderResult.UNDERFLOW;
                }
                char base = charAt(byteBuffer.get());
                if (base == '\0') {
                    // Diacritic followed by an unmapped byte: reject the pair
                    // instead of composing with NUL.
                    byteBuffer.position(mark);
                    return CoderResult.unmappableForLength(2);
                }
                String composed = Normalizer.normalize(new String(new char[]{base, c}), NFKC);
                if (composed.length() > charBuffer.remaining()) {
                    // Not every pair has a single precomposed form.
                    byteBuffer.position(mark);
                    return CoderResult.OVERFLOW;
                }
                charBuffer.put(composed);
            }
            return CoderResult.UNDERFLOW;
        }
    }

    /**
     * Encoder from chars to T.61 bytes. Characters present in the mapping
     * table are written as their table index; other characters are
     * decomposed (NFKD) and each resulting char is looked up in the table.
     * Diacritics are moved in front of the previously written base letter.
     */
    static class Encoder extends CharsetEncoder {
        protected Encoder() {
            super(T61, 1f, MAX_BYTES_PER_CHAR, new byte[]{'?'});
        }

        /**
         * @param c The char to check.
         * @return True if {@link #encodeLoop} can encode the char.
         */
        @Override
        public boolean canEncode(char c) {
            return expand(c) != null;
        }

        /**
         * @param cs The char sequence to check.
         * @return True if every char in the sequence can be encoded.
         */
        @Override
        public boolean canEncode(CharSequence cs) {
            return cs.chars().allMatch(c -> canEncode((char) c));
        }

        /**
         * Encodes as many chars as possible. Nothing is written to the output
         * for a char that is reported as unmappable, malformed or overflowing.
         *
         * @param charBuffer The chars to encode.
         * @param byteBuffer The output to write T.61 bytes to.
         * @return {@code UNDERFLOW} when the input is consumed or ends in a
         *         high surrogate (left unread), {@code OVERFLOW} when the
         *         output cannot hold the next char, malformed for unpaired
         *         surrogates, unmappable for anything else that has no T.61
         *         representation (including all surrogate pairs).
         */
        @Override
        protected CoderResult encodeLoop(CharBuffer charBuffer, ByteBuffer byteBuffer) {
            while (charBuffer.hasRemaining()) {
                int mark = charBuffer.position();
                char c = charBuffer.get();

                if (Character.isHighSurrogate(c)) {
                    if (!charBuffer.hasRemaining()) {
                        // The low surrogate may arrive with the next chunk.
                        charBuffer.position(mark);
                        return CoderResult.UNDERFLOW;
                    }
                    boolean paired = Character.isLowSurrogate(charBuffer.get());
                    charBuffer.position(mark);
                    // No surrogate pairs are supported.
                    return paired
                           ? CoderResult.unmappableForLength(2)
                           : CoderResult.malformedForLength(1);
                } else if (Character.isLowSurrogate(c)) {
                    charBuffer.position(mark);
                    return CoderResult.malformedForLength(1);
                }

                // Validate the whole expansion before writing anything.
                String expanded = expand(c);
                if (expanded == null) {
                    charBuffer.position(mark);
                    return CoderResult.unmappableForLength(1);
                }
                if (expanded.length() > byteBuffer.remaining()) {
                    charBuffer.position(mark);
                    return CoderResult.OVERFLOW;
                }

                for (int i = 0; i < expanded.length(); ++i) {
                    char cc = expanded.charAt(i);
                    byte encoded = (byte) indexOf(cc);
                    int end = byteBuffer.position();
                    if (end > 0 && isDiacritic(cc)) {
                        // we must swap the order of char and matching diacritic.
                        byte previous = byteBuffer.get(end - 1);
                        if (previous > 0) {
                            byteBuffer.put(end - 1, encoded).put(previous);
                            continue;
                        }
                    }
                    byteBuffer.put(encoded);
                }
            }
            return CoderResult.UNDERFLOW;
        }
    }

    /**
     * @param c The char to check.
     * @return True if the char is in the combining diacritic range that the
     *         coders treat as "belongs to a base letter".
     */
    private static boolean isDiacritic(char c) {
        return c >= '\u0300' && c <= '\u0340';
    }

    /**
     * Finds the chars that must be looked up in the mapping table in order to
     * encode a single char.
     *
     * @param c The char to encode.
     * @return The char itself if it is in the table, otherwise its NFKD
     *         decomposition if all parts are in the table and it fits within
     *         {@link #MAX_BYTES_PER_CHAR}. Null if the char cannot be encoded.
     */
    private static String expand(char c) {
        if (indexOf(c) > 0) {
            // Direct hit. Normalizing first would lose table characters that
            // have a compatibility decomposition (fractions, superscripts...).
            return String.valueOf(c);
        }
        String expanded = Normalizer.normalize(String.valueOf(c), Normalizer.Form.NFKD);
        if (expanded.isEmpty() || expanded.length() > MAX_BYTES_PER_CHAR) {
            return null;
        }
        for (int i = 0; i < expanded.length(); ++i) {
            // Index 0 is NUL, which is treated as unmapped.
            if (indexOf(expanded.charAt(i)) <= 0) {
                return null;
            }
        }
        return expanded;
    }

    /**
     * @param c The char to look up.
     * @return The T.61 byte value (0..255) for the char, or -1 if the char is
     *         not in the mapping table. NUL yields 0.
     */
    static int indexOf(char c) {
        // Fast path for chars that sit at their own code point in the table.
        if (c < 0x100 && MAPPING.charAt(c) == c) return c;
        return MAPPING.indexOf(c);
    }

    /**
     * @param i The byte value, either signed (-128..127) or unsigned (0..255).
     * @return The mapped char, or {@code '\0'} if the byte value is unmapped.
     */
    static char charAt(int i) {
        if (i < 0) i = 0x100 + i;
        return MAPPING.charAt(i);
    }
}