// GSMCharset.java

/*
 * Copyright (c) 2020, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.strings.enc;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.text.Normalizer;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;

import static java.lang.Character.isHighSurrogate;
import static java.lang.Character.isLowSurrogate;
import static java.util.Objects.requireNonNull;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Bengali;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Default;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Gujarati;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Hindi;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Kannada;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Malayalam;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Oriya;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Portuguese;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Punjabi;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Spanish;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Tamil;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Telugu;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Turkish;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Urdu;
import static net.morimekta.strings.internal.GSMCharsetUtil.EXT_CODE;

/**
 * The GSM Charset is actually a set of charsets that use the
 * same rules for how to encode, but with different character
 * tables based on preset language settings.
 * <p>
 * Also known as <b>GSM 03.38</b> or <b>3GPP 23.038</b>.
 * <p>
 * Each instance is identified by a pair of national language identifiers:
 * the <em>locking shift</em> identifier, which selects the main character
 * table, and the <em>single shift</em> identifier, which selects the
 * extended character table.
 * <p>
 * See <a href="https://en.wikipedia.org/wiki/GSM_03.38#GSM_7-bit_default_alphabet_and_extension_table_of_3GPP_TS_23.038_/_GSM_03.38">GSM 03.38</a> on Wikipedia.<br>
 * See <a href="https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/17.00.00_60/ts_123038v170000p.pdf">3GPP TS 23.038 v 17.0</a> on ETSI.org.<br>
 */
public class GSMCharset extends Charset {
    /**
     * The UCS2 charset. Similar to UTF-16, but originally did not support
     * surrogate pairs, and is always big endian (no endian marker on encoded
     * stream).
     */
    public static final Charset         UCS2           = Charset.forName("ISO-10646-UCS-2");
    /**
     * The Default GSM charset (standard western NLI)
     */
    public static final GSMCharset      GSM            = new GSMCharset();
    /**
     * The Turkish GSM charset
     */
    public static final GSMCharset      GSM_Turkish    = new GSMCharset(Turkish, Turkish);
    /**
     * The Spanish GSM charset. Uses the default locking shift table together
     * with the Spanish single shift table.
     */
    public static final GSMCharset      GSM_Spanish    = new GSMCharset(Default, Spanish);
    /**
     * The Portuguese GSM charset
     */
    public static final GSMCharset      GSM_Portuguese = new GSMCharset(Portuguese, Portuguese);
    /**
     * The Bengali GSM charset
     */
    public static final GSMCharset      GSM_Bengali    = new GSMCharset(Bengali, Bengali);
    /**
     * The Gujarati GSM charset
     */
    public static final GSMCharset      GSM_Gujarati   = new GSMCharset(Gujarati, Gujarati);
    /**
     * The Hindi GSM charset
     */
    public static final GSMCharset      GSM_Hindi      = new GSMCharset(Hindi, Hindi);
    /**
     * The Kannada GSM charset
     */
    public static final GSMCharset      GSM_Kannada    = new GSMCharset(Kannada, Kannada);
    /**
     * The Malayalam GSM charset
     */
    public static final GSMCharset      GSM_Malayalam  = new GSMCharset(Malayalam, Malayalam);
    /**
     * The Oriya GSM charset
     */
    public static final GSMCharset      GSM_Oriya      = new GSMCharset(Oriya, Oriya);
    /**
     * The Punjabi GSM charset
     */
    public static final GSMCharset      GSM_Punjabi    = new GSMCharset(Punjabi, Punjabi);
    /**
     * The Tamil GSM charset
     */
    public static final GSMCharset      GSM_Tamil      = new GSMCharset(Tamil, Tamil);
    /**
     * The Telugu GSM charset
     */
    public static final GSMCharset      GSM_Telugu     = new GSMCharset(Telugu, Telugu);
    /**
     * The Urdu GSM charset
     */
    public static final GSMCharset      GSM_Urdu       = new GSMCharset(Urdu, Urdu);
    /**
     * Set of all GSM charsets default NLI combinations.
     */
    public static final Set<GSMCharset> GSM_CHARSETS   = Set.of(
            GSM, GSM_Turkish, GSM_Spanish, GSM_Portuguese,
            GSM_Bengali, GSM_Gujarati, GSM_Hindi, GSM_Kannada, GSM_Malayalam,
            GSM_Oriya, GSM_Punjabi, GSM_Tamil, GSM_Telugu, GSM_Urdu);

    private final GSMNationalLanguageIdentifier lockingShift;
    private final GSMNationalLanguageIdentifier singleShift;

    /**
     * Detect best fitting GSM charset for the text. It will match
     * the text to the best pair of same-locale static and single
     * shift language identifiers, so not necessarily the true optimal.
     * Will fall back to UCS2 (UTF16-BE) if no GSM charset is available.
     *
     * @param text The text to detect charset for.
     * @return The detected charset.
     */
    public static Charset detectCharset(String text) {
        return detectCharset(text, GSM_CHARSETS);
    }

    /**
     * Detect best fitting GSM charset for the text among the given
     * alternatives. The default {@link #GSM} charset is returned for empty
     * text, and is preferred whenever it is among the alternatives and can
     * encode the text. Otherwise the alternative with the lowest
     * {@link #modifications()} score that can encode the text is chosen.
     * Will fall back to UCS2 (UTF16-BE) if no GSM charset is available.
     *
     * @param text The text to detect charset for.
     * @param alt  Alternative GSM charsets to check.
     * @return The detected charset.
     */
    public static Charset detectCharset(String text, Set<GSMCharset> alt) {
        requireNonNull(text, "text == null");
        requireNonNull(alt, "alt == null");
        if (text.isEmpty()) {
            return GSM;
        }
        // Work on a copy, the caller's set must not be modified.
        Set<GSMCharset> available = new HashSet<>(alt);
        available.removeIf(cs -> !cs.canEncode(text));
        if (available.contains(GSM)) {
            return GSM;
        }
        return available.stream()
                        .min(Comparator.comparingInt(GSMCharset::modifications))
                        .<Charset>map(cs -> cs)
                        .orElse(UCS2);
    }

    /**
     * Get GSM charset for the matching locales. One of the predefined
     * charsets in {@link #GSM_CHARSETS} is returned when it has exactly this
     * combination, otherwise a new charset instance is created.
     *
     * @param lockingLocale The main locking locale.
     * @param extLocale     The ext (escape) locale.
     * @return The charset.
     */
    public static GSMCharset forNationalLanguageIdentifier(GSMNationalLanguageIdentifier lockingLocale,
                                                           GSMNationalLanguageIdentifier extLocale) {
        requireNonNull(lockingLocale, "lockingLocale == null");
        requireNonNull(extLocale, "extLocale == null");
        if (lockingLocale == Default && extLocale == Default) {
            return GSM;
        }
        for (GSMCharset charset : GSM_CHARSETS) {
            if (charset.lockingShift == lockingLocale && charset.singleShift == extLocale) {
                return charset;
            }
        }
        return new GSMCharset(lockingLocale, extLocale);
    }

    private GSMCharset() {
        this(Default, Default);
    }

    private GSMCharset(GSMNationalLanguageIdentifier lockingLocale,
                       GSMNationalLanguageIdentifier extLocale) {
        // Null checks for both identifiers happen while building the canonical name.
        super(makeCharsetCanonicalName(lockingLocale, extLocale),
              makeCharsetAliasNames(lockingLocale, extLocale));
        requireNonNull(lockingLocale.basic, "No basic character table for " + lockingLocale.name());
        this.lockingShift = lockingLocale;
        this.singleShift = extLocale;
    }

    /**
     * Get the locking language identifier.
     *
     * @return Get the locking language identifier, indicating the main charset.
     */
    public GSMNationalLanguageIdentifier getLockingShift() {
        return lockingShift;
    }

    /**
     * Get the single shift language identifier.
     *
     * @return Get the single shift language identifier, indicating the extended charset.
     */
    public GSMNationalLanguageIdentifier getSingleShift() {
        return singleShift;
    }

    /**
     * Get the number of modifications relative to full default. Useful for finding the simplest
     * charset compatible with a string. See {@link GSMCharset#detectCharset(String)} for details.
     * <p>
     * A non-default locking shift counts as 1 and a non-default single shift counts as 2, so
     * the value is in the range 0 to 3.
     *
     * @return The number of modification points from default.
     */
    public int modifications() {
        int points = 0;
        if (lockingShift != Default) {
            points += 1;
        }
        if (singleShift != Default) {
            points += 2;
        }
        return points;
    }

    /**
     * If the charset combination can encode the given single character.
     * <p>
     * The escape code itself is never encodable as a character, as it is reserved for
     * switching to the single shift table. Every other character is encodable if and only
     * if it is present in the locking shift basic table or the single shift table; this
     * includes control characters such as line feed and carriage return when the tables
     * define them.
     *
     * @param c The char to check.
     * @return If the char can be encoded.
     */
    public boolean canEncode(char c) {
        if (c == EXT_CODE) {
            return false;
        }
        return lockingShift.basicMap.containsKey(c) ||
               singleShift.shiftMap.containsKey(c);
    }

    /**
     * Check if this charset can encode the provided char sequence.
     * <p>
     * Two-char sequences that have their own code in either table are matched first, and
     * consume both chars. Remaining chars are checked one by one as in
     * {@link #canEncode(char)}.
     *
     * @param cs The string to check.
     * @return If the string can be fully encoded with the charset.
     */
    public boolean canEncode(CharSequence cs) {
        for (int i = 0; i < cs.length(); ++i) {
            char c = cs.charAt(i);
            if (c == EXT_CODE) {
                // Only the escape code itself is reserved; everything else is table driven.
                return false;
            }
            if (i < (cs.length() - 1) && canEncodePair(c, cs.charAt(i + 1))) {
                // The pair is encoded as one unit, so skip the second char too.
                ++i;
                continue;
            }
            if (!lockingShift.basicMap.containsKey(c) &&
                !singleShift.shiftMap.containsKey(c)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if the two chars together form a sequence with its own code in either the
     * locking shift basic table or the single shift table.
     */
    private boolean canEncodePair(char first, char second) {
        boolean basicCandidate = lockingShift.basicExtended.contains(first);
        boolean shiftCandidate = singleShift.shiftExtended.contains(first);
        if (!basicCandidate && !shiftCandidate) {
            // Avoid building the lookup string for the common case.
            return false;
        }
        String pair = "" + first + second;
        return (basicCandidate && lockingShift.basicStringCode.containsKey(pair)) ||
               (shiftCandidate && singleShift.shiftStringCode.containsKey(pair));
    }

    /**
     * Normalize the text into the closest string to what can be encoded without errors by the
     * charset.
     * <p>
     * Text that can already be encoded is returned unchanged. Otherwise the text is
     * normalized to NFKC and encoded. Each unmappable char is decomposed (NFKD), and the
     * encodable parts of the decomposition are kept. If no part can be kept, or the input is
     * a surrogate or otherwise malformed, the replacement char is inserted instead. The
     * encoded bytes are finally decoded back to a string with this charset.
     *
     * @param text        The string to normalize.
     * @param replacement The replacement character if unable to normalize to a known partial
     *                    replacement of an unmappable char.
     * @return The string with unmappable characters replaced with the closest equivalent if
     * possible, or with the replacement character if unable to normalize.
     * @throws IllegalArgumentException If the replacement char cannot be encoded.
     */
    public String normalize(CharSequence text, char replacement) {
        if (!canEncode(replacement)) {
            throw new IllegalArgumentException("Unable to encode replacement");
        }
        if (canEncode(text)) {
            return text.toString();
        }

        ByteBuffer replacementBytes = encode(String.valueOf(replacement));
        String normalized = Normalizer.normalize(text, Normalizer.Form.NFKC);
        ByteBuffer out = ByteBuffer.allocate(Math.max(128, normalized.length()));
        CharBuffer in = CharBuffer.wrap(normalized);
        CharsetEncoder encoder = newEncoder();
        CoderResult result;
        while (!(result = encoder.encode(in, out, false)).isUnderflow()) {
            if (result.isOverflow()) {
                out = ensureRemaining(out, out.remaining() + 1);
                continue;
            }

            // Unmappable or malformed input at the current position of 'in'.
            boolean replaced = false;
            if (result.isUnmappable()) {
                // CharBuffer.charAt is relative to the current position.
                char c = in.charAt(0);
                if (!isLowSurrogate(c) && !isHighSurrogate(c)) {
                    String decomposed = Normalizer.normalize(String.valueOf(c), Normalizer.Form.NFKD);
                    for (char d : decomposed.toCharArray()) {
                        if (!canEncode(d)) {
                            continue;
                        }
                        CharBuffer single = CharBuffer.wrap(String.valueOf(d));
                        CoderResult partial;
                        // Retry on overflow so the char is not silently lost.
                        while ((partial = encoder.encode(single, out, false)).isOverflow()) {
                            out = ensureRemaining(out, out.remaining() + 1);
                        }
                        if (!partial.isError()) {
                            replaced = true;
                        }
                    }
                }
            }
            if (!replaced) {
                out = ensureRemaining(out, replacementBytes.remaining());
                // Duplicate so the replacement buffer can be reused as is.
                out.put(replacementBytes.duplicate());
            }
            // Skip past the input that could not be encoded.
            in.position(Math.min(in.limit(), in.position() + result.length()));
        }
        byte[] encoded = new byte[out.position()];
        out.flip();
        out.get(encoded);
        return new String(encoded, this);
    }

    /**
     * Make sure the buffer has room for at least the required number of additional bytes,
     * copying the content written so far into a larger buffer if needed.
     */
    private static ByteBuffer ensureRemaining(ByteBuffer buffer, int required) {
        if (buffer.remaining() >= required) {
            return buffer;
        }
        int capacity = Math.max(buffer.capacity() * 2, buffer.position() + required);
        return ByteBuffer.allocate(capacity).put(buffer.flip());
    }

    /**
     * A GSM charset only contains GSM charsets with the exact same locking shift and single
     * shift identifiers.
     *
     * @param charset The charset to check.
     * @return If the given charset is contained in this charset.
     */
    @Override
    public boolean contains(Charset charset) {
        if (!(charset instanceof GSMCharset)) {
            return false;
        }
        GSMCharset other = (GSMCharset) charset;
        return other.lockingShift == lockingShift &&
               other.singleShift == singleShift;
    }

    @Override
    public CharsetDecoder newDecoder() {
        return new GSMCharsetDecoder(this);
    }

    @Override
    public CharsetEncoder newEncoder() {
        return new GSMCharsetEncoder(this);
    }

    /**
     * Build the canonical charset name: "3GPP-23.038", followed by "-" and the locking shift
     * language code if not default, followed by "+" and the single shift language code if
     * not default.
     */
    private static String makeCharsetCanonicalName(
            GSMNationalLanguageIdentifier lockingLocale,
            GSMNationalLanguageIdentifier extLocale) {
        requireNonNull(lockingLocale, "lockingLocale == null");
        requireNonNull(extLocale, "extLocale == null");
        String lockingPart = lockingLocale == Default ? "" : "-" + lockingLocale.iso639_1;
        String extPart = extLocale == Default ? "" : "+" + extLocale.iso639_1;
        return "3GPP-23.038" + lockingPart + extPart;
    }

    /**
     * Build the charset aliases. Same-locale combinations (and default + Spanish) with a
     * language code get the alias "GSM-" followed by the language code, the full default
     * gets "GSM", and all other combinations have no alias.
     */
    private static String[] makeCharsetAliasNames(
            GSMNationalLanguageIdentifier lockingLocale,
            GSMNationalLanguageIdentifier extLocale) {
        boolean simpleCombination =
                lockingLocale == extLocale || (lockingLocale == Default && extLocale == Spanish);
        if (extLocale.iso639_1 != null && simpleCombination) {
            return new String[]{"GSM-" + extLocale.iso639_1};
        }
        if (lockingLocale == Default && extLocale == Default) {
            return new String[]{"GSM"};
        }
        return new String[0];
    }
}