GSMCharsetUtil.java

/*
 * Copyright (c) 2020, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.strings.internal;

import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

/**
 * Character tables and helpers for the GSM 7-bit alphabets (3GPP TS 23.038),
 * including the national language locking shift ({@code *_LS}) and single
 * shift ({@code *_SS}) tables.
 * <p>
 * Every table is a string that is meant to hold exactly 128 chars, so that
 * the septet value is the index into the string. The tables are written as
 * 8 rows of 16 chars, i.e. row {@code r}, column {@code c} is septet
 * {@code 0x10 * r + c}. Three char values are used as markers, not content:
 * <ul>
 *   <li>{@code \0}: the septet has no character in this table.</li>
 *   <li>{@code \1}: placeholder at {@link #SHIFT_SEPTET} (0x1b).</li>
 *   <li>{@code \2} ({@link #EXT_CODE}): the septet maps to more than one
 *       char, and the actual string is found in the matching {@code *_SX}
 *       (for single shift) or {@code *_LX} (for locking shift) map.</li>
 * </ul>
 */
public final class GSMCharsetUtil {
    /** Default alphabet, basic character set. */
    public static final String DEFAULT_BC;
    /** Default alphabet, extension table. */
    public static final String DEFAULT_EX;
    public static final String TURKISH_LS;
    public static final String TURKISH_SS;
    public static final String SPANISH_SS;
    public static final String PORTUGUESE_LS;
    public static final String PORTUGUESE_SS;
    public static final String BENGALI_LS;
    public static final String BENGALI_SS;
    /** Multi-char entries for the {@link #EXT_CODE} positions of {@link #BENGALI_SS}. */
    public static final Map<Byte, String> BENGALI_SX;
    public static final String GUJARATI_LS;
    public static final String GUJARATI_SS;
    public static final String HINDI_LS;
    public static final String HINDI_SS;
    /** Multi-char entries for the {@link #EXT_CODE} positions of {@link #HINDI_SS}. */
    public static final Map<Byte, String> HINDI_SX;
    public static final String KANNADA_LS;
    public static final String KANNADA_SS;
    public static final String MALAYALAM_LS;
    public static final String MALAYALAM_SS;
    public static final String ORIYA_LS;
    public static final String ORIYA_SS;
    /** Multi-char entries for the {@link #EXT_CODE} positions of {@link #ORIYA_SS}. */
    public static final Map<Byte, String> ORIYA_SX;
    public static final String PUNJABI_LS;
    public static final String PUNJABI_SS;
    /** Multi-char entries for the {@link #EXT_CODE} positions of {@link #PUNJABI_LS}. */
    public static final Map<Byte, String> PUNJABI_LX;
    /** Multi-char entries for the {@link #EXT_CODE} positions of {@link #PUNJABI_SS}. */
    public static final Map<Byte, String> PUNJABI_SX;
    public static final String TAMIL_LS;
    public static final String TAMIL_SS;
    public static final String TELUGU_LS;
    public static final String TELUGU_SS;
    public static final String URDU_LS;
    public static final String URDU_SS;

    /** The septet that selects the extension / single shift table for the next septet. */
    public static final byte SHIFT_SEPTET = 0x1b;
    /** Marker char in a table for a septet whose value is a multi-char string. */
    public static final char EXT_CODE = '\2';

    static {
        // The basic set only uses 7 bits (a septet) for each
        // character. If the 'esc' char is encountered, the next
        // char (only) is taken from the basic extension table.
        DEFAULT_BC = join(
                "@£$¥èéùìòÇ\nØø\rÅå",
                "Δ_ΦΓΛΩΠΨΣΘΞ\1ÆæßÉ",
                " !\"#¤%&'()*+,-./",
                "0123456789:;<=>?",
                "¡ABCDEFGHIJKLMNO",
                // 0x5e is U-umlaut and 0x5f is the section sign: two separate chars.
                "PQRSTUVWXYZÄÖÑÜ§",
                "¿abcdefghijklmno",
                "pqrstuvwxyzäöñüà");
        DEFAULT_EX = join(
                "\0\0\0\0\0\0\0\0\0\0\f\0\0\0\0\0",
                "\0\0\0\0^\0\0\0\0\0\0\1\0\0\0\0",
                "\0\0\0\0\0\0\0\0{}\0\0\0\0\0\\",
                "\0\0\0\0\0\0\0\0\0\0\0\0[~]\0",
                "|\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");

        // NOTE(review): TS 23.038 appears to list c-cedilla (not the inverted
        // question mark) at 0x60 of the Turkish locking shift table - confirm
        // against the spec before changing.
        TURKISH_LS = join(
                "@£$¥€éùıòÇ\nĞğ\rÅå",
                "Δ_ΦΓΛΩΠΨΣΘΞ\1ŞşßÉ",
                " !\"#¤%&'()*+,-./",
                "0123456789:;<=>?",
                "İABCDEFGHIJKLMNO",
                "PQRSTUVWXYZÄÖÑÜ§",
                "¿abcdefghijklmno",
                "pqrstuvwxyzäöñüà");
        TURKISH_SS = join(
                "\0\0\0\0\0\0\0\0\0\0\f\0\0\b\0\0",
                "\0\0\0\0^\0\0\0\0\0\0\1\0\0\0\0",
                "\0\0\0\0\0\0\0\0{}\0\0\0\0\0\\",
                "\0\0\0\0\0\0\0\0\0\0\0\0[~]\0",
                "|\0\0\0\0\0\0Ğ\0İ\0\0\0\0\0\0",
                "\0\0\0Ş\0\0\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0ç\0€\0ğ\0ı\0\0\0\0\0\0",
                "\0\0\0ş\0\0\0\0\0\0\0\0\0\0\0\0");
        SPANISH_SS = join(
                "\0\0\0\0\0\0\0\0\0ç\f\0\0\0\0\0",
                "\0\0\0\0^\0\0\0\0\0\0\1\0\0\0\0",
                "\0\0\0\0\0\0\0\0{}\0\0\0\0\0\\",
                "\0\0\0\0\0\0\0\0\0\0\0\0[~]\0",
                "|Á\0\0\0\0\0\0\0Í\0\0\0\0\0Ó",
                "\0\0\0\0\0Ú\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0í\0\0\0\0\0ó",
                "\0\0\0\0\0ú\0\0\0\0\0\0\0\0\0\0");
        PORTUGUESE_LS = join(
                // 0x09 is the lower case c-cedilla; the upper case one is at 0x13.
                "@£$¥êéúíóç\nÔô\rÁá",
                // 0x1f is E-acute; e-circumflex is already at 0x04.
                "Δ_ªÇÀ∞^\\€Ó|\1ÂâÊÉ",
                " !\"#º%&'()*+,-./",
                "0123456789:;<=>?",
                "ÍABCDEFGHIJKLMNO",
                "PQRSTUVWXYZÃÕÚÜ§",
                "~abcdefghijklmno",
                "pqrstuvwxyzãõ`üà");
        PORTUGUESE_SS = join(
                "\0\0\0\0\0ê\0\0\0ç\fÔô\bÁá",
                "\0\0ΦΓ^ΩΠΨΣΘ\0\1\0\0\0Ê",
                "\0\0\0\0\0\0\0\0{}\0\0\0\0\0\\",
                "\0\0\0\0\0\0\0\0\0\0\0\0[~]\0",
                "|À\0\0\0\0\0\0\0Í\0\0\0\0\0Ó",
                "\0\0\0\0\0Ú\0\0\0\0\0ÃÕ\0\0\0",
                "\0Â\0\0\0€\0\0\0í\0\0\0\0\0ó",
                "\0\0\0\0\0ú\0\0\0\0\0ãõ\0\0â");
        // NOTE(review): 0x7c and 0x7d below repeat the letters at 0x24 and
        // 0x25; TS 23.038 appears to list U+09DC and U+09DD there - confirm
        // against the spec before changing.
        BENGALI_LS = join(
                "ঁংঃঅআইঈউঊঋ\nঌ\0\r\0এ",
                "ঐ\0\0ওঔকখগঘঙচ\1ছজঝঞ",
                " !টঠডঢণত)(থদ,ধ.ন",
                "0123456789:;\0পফ?",
                "বভমযর\0ল\0\0\0শষসহ়ঽ",
                "ািীুূৃৄ\0\0েৈ\0\0োৌ্",
                "ৎabcdefghijklmno",
                "pqrstuvwxyzৗডঢৰৱ");
        BENGALI_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*০১\1২৩৪৫",
                "৬৭৮৯\2ৠৡৢ{}ৣ৲৳৴৵\\",
                "৶৷৸৹৺\0\0\0\0\0\0\0[~]\0",
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        BENGALI_SX = Map.of((byte) 0x24, "য়");
        GUJARATI_LS = join(
                "ઁંઃઅઆઇઈઉઊઋ\nઌઍ\r\0એ",
                "ઐઑ\0ઓઔકખગઘઙચ\1છજઝઞ",
                " !ટઠડઢણત)(થદ,ધ.ન",
                "0123456789:;\0પફ?",
                "બભમયર\0લળ\0વશષસહ઼ઽ",
                "ાિીુૂૃૄૅ\0ેૈૉ\0ોૌ્",
                "ૐabcdefghijklmno",
                "pqrstuvwxyzૠૡૢૣ૱");
        GUJARATI_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*।॥\1૦૧૨૩",
                "૪૫૬૭૮૯\0\0{}\0\0\0\0\0\\",
                "\0\0\0\0\0\0\0\0\0\0\0\0[~]\0",
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        HINDI_LS = join(
                "ँंःअआइईउऊऋ\nऌऍ\rऎए",
                "ऐऑऒओऔकखगघङच\1छजझञ",
                " !टठडढणत)(थद,ध.न",
                "0123456789:;ऩपफ?",
                "बभमयरऱलळऴवशषसह़ऽ",
                "ािीुूृॄॅॆेैॉॊोौ्",
                "ॐabcdefghijklmno",
                "pqrstuvwxyzॲॻॼॾॿ");
        HINDI_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*।॥\1०१२३",
                "४५६७८९॒॑{}॓॔\2\2\2\\",
                "\2\2\2\2\2ॠॡॢॣ॰ॱ\0[~]\0",
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        HINDI_SX = Map.of(
                (byte) 0x2c, "क़",
                (byte) 0x2d, "ख़",
                (byte) 0x2e, "ग़",
                (byte) 0x30, "ज़",
                (byte) 0x31, "ड़",
                (byte) 0x32, "ढ़",
                (byte) 0x33, "फ़",
                (byte) 0x34, "य़"
        );
        KANNADA_LS = join(
                "\0ಂಃಅಆಇಈಉಊಋ\nಌ\0\rಎಏ",
                "ಐ\0ಒಓಔಕಖಗಘಙಚ\1ಛಜಝಞ",
                // 0x24 is DDA, like in the other Indic tables; PA lives at 0x3d.
                " !ಟಠಡಢಣತ)(ಥದ,ಧ.ನ",
                "0123456789:;\0ಪಫ?",
                "ಬಭಮಯರಱಲಳ\0ವಶಷಸಹ಼ಽ",
                "ಾಿೀುೂೃೄ\0ೆೇೈ\0ೊೋೌ್",
                "ೕabcdefghijklmno",
                "pqrstuvwxyzೖೠೡೢೣ");
        KANNADA_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*।॥\1೦೧೨೩",
                "೪೫೬೭೮೯ೞೱ{}ೲ\0\0\0\0\\",
                "\0\0\0\0\0\0\0\0\0\0\0\0[~]\0",
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        MALAYALAM_LS = join(
                "\0ംഃഅആഇഈഉഊഋ\nഌ\0\rഎഏ",
                "ഐ\0ഒഓഔകഖഗഘങച\1ഛജഝഞ",
                " !ടഠഡഢണത)(ഥദ,ധ.ന",
                "0123456789:;\0പഫ?",
                "ബഭമയരറലളഴവശഷസഹ\0ഽ",
                "ാിീുൂൃൄ\0െേൈ\0ൊോൌ്",
                "ൗabcdefghijklmno",
                "pqrstuvwxyzൠൡൢൣ൹");
        MALAYALAM_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*।॥\1൦൧൨൩",
                "൪൫൬൭൮൯൰൱{}൲൳൴൵ൺ\\",
                "ൻർൽൾൿ\0\0\0\0\0\0\0[~]\0",
                // 0x40 is the vertical bar, as in every other single shift
                // table; the hyphen is already at 0x0e.
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        ORIYA_LS = join(
                "ଁଂଃଅଆଇଈଉଊଋ\nଌ\0\r\0ଏ",
                "ଐ\0\0ଓଔକଖଗଘଙଚ\1ଛଜଝଞ",
                " !ଟଠଡଢଣତ)(ଥଦ,ଧ.ନ",
                "0123456789:;\0ପଫ?",
                "ବଭମଯର\0ଲଳ\0ଵଶଷସହ଼ଽ",
                "ାିୀୁୂୃୄ\0\0େୈ\0\0ୋୌ୍",
                "ୖabcdefghijklmno",
                "pqrstuvwxyzୗୠୡୢୣ");
        ORIYA_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*।॥\1୦୧୨୩",
                "୪୫୬୭୮୯\2\2{}ୟ୰ୱ\0\0\\",
                "\0\0\0\0\0\0\0\0\0\0\0\0[~]\0",
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        ORIYA_SX = Map.of(
                (byte) 0x26, "ଡ଼",
                (byte) 0x27, "ଢ଼"
        );
        PUNJABI_LS = join(
                "ਁਂਃਅਆਇਈਉਊ\0\n\0\0\r\0ਏ",
                "ਐ\0\0ਓਔਕਖਗਘਙਚ\1ਛਜਝਞ",
                " !ਟਠਡਢਣਤ)(ਥਦ,ਧ.ਨ",
                "0123456789:;\0ਪਫ?",
                "ਬਭਮਯਰ\0ਲ\2\0ਵ\2\0ਸਹ਼\0",
                "ਾਿੀੁੂ\0\0\0\0ੇੈ\0\0ੋੌ੍",
                "ੑabcdefghijklmno",
                "pqrstuvwxyzੰੱੲੳੴ");
        PUNJABI_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*।॥\1੦੧੨੩",
                "੪੫੬੭੮੯\2\2{}\2ੜ\2ੵ\0\\",
                "\0\0\0\0\0\0\0\0\0\0\0\0[~]\0",
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        PUNJABI_LX = Map.of(
                (byte) 0x47, "ਲ਼",
                (byte) 0x4a, "ਸ਼"
        );
        PUNJABI_SX = Map.of(
                (byte) 0x26, "ਖ਼",
                (byte) 0x27, "ਗ਼",
                (byte) 0x2a, "ਜ਼",
                (byte) 0x2c, "ਫ਼"
        );
        TAMIL_LS = join(
                "\0ஂஃஅஆஇஈஉஊ\0\n\0\0\rஎஏ",
                "ஐ\0ஒஓஔக\0\0\0ஙச\1\0ஜ\0ஞ",
                " !ட\0\0\0ணத)(\0\0,\0.ந",
                "0123456789:;னப\0?",
                "\0\0மயரறலளழவஶஷஸஹ\0\0",
                "ாிீுூ\0\0\0ெேை\0ொோௌ்",
                // 0x60 is the Tamil OM sign (U+0BD0), not a Gurmukhi sign.
                "ௐabcdefghijklmno",
                "pqrstuvwxyzௗ௰௱௲௹");
        TAMIL_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*।॥\1௦௧௨௩",
                "௪௫௬௭௮௯௳௴{}௵௶௷௸௺\\",
                "\0\0\0\0\0\0\0\0\0\0\0\0[~]\0",
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        TELUGU_LS = join(
                "ఁంఃఅఆఇఈఉఊఋ\nఌ\0\rఎఏ",
                "ఐ\0ఒఓఔకఖగఘఙచ\1ఛజఝఞ",
                " !టఠడఢణత)(థద,ధ.న",
                "0123456789:;\0పఫ?",
                "బభమయరఱలళ\0వశషసహ\0ఽ",
                "ాిీుూృౄ\0ెేై\0ొోౌ్",
                "ౕabcdefghijklmno",
                "pqrstuvwxyzౖౠౡౢౣ");
        TELUGU_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*\0\0\1౦౧౨౩",
                "౪౫౬౭౮౯ౘౙ{}౸౹౺౻౼\\",
                "౽౾౿\0\0\0\0\0\0\0\0\0[~]\0",
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        URDU_LS = join(
                "اآبٻڀپڦتۂٿ\nٹٽ\rٺټ",
                "ثجځڄڃڅچڇحخد\1ڌڈډڊ",
                " !ڏڍذرڑړ)(ڙز,ږ.ژ",
                "0123456789:;ښسش?",
                "صضطظعفقکڪګگڳڱلمن",
                "ںڻڼوۄەہھءیېےٍُِٗ",
                "ٔabcdefghijklmno",
                "pqrstuvwxyzّٰٕٖٓ");
        URDU_SS = join(
                "@£$¥¿\"¤%&'\f*+\b-/",
                "<=>¡^¡_#*؀؁\1۰۱۲۳",
                "۴۵۶۷۸۹،؍{}؎؏ؐؑؒ\\",
                "ؓؔ؛؟ـْ٘٫٬ٲٳۍ[~]۔",
                "|ABCDEFGHIJKLMNO",
                "PQRSTUVWXYZ\0\0\0\0\0",
                "\0\0\0\0\0€\0\0\0\0\0\0\0\0\0\0",
                "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
    }

    /**
     * Concatenate the parts into a single string, in order.
     * <p>
     * The tables in this class are written as 8 parts of 16 chars each, but
     * this method does not enforce that; {@link #reverse(String)} checks the
     * total length of the table it is given.
     *
     * @param parts The strings to concatenate.
     * @return The concatenated string.
     */
    public static String join(String... parts) {
        return String.join("", parts);
    }

    /**
     * Build the char to septet lookup for a table.
     * <p>
     * Positions holding a marker char ({@code \0}, {@code \1} or
     * {@link #EXT_CODE}) are skipped. If the same char is present at more
     * than one position, the highest position is the one kept.
     *
     * @param table The 128 char table to reverse, or null.
     * @return Immutable map from char to its septet, or null if the table was null.
     * @throws IllegalArgumentException If the table is not exactly 128 chars long.
     */
    public static Map<Character, Byte> reverse(String table) {
        if (table == null) {
            return null;
        }
        if (table.length() != 128) {
            throw new IllegalArgumentException("Table length == " + table.length());
        }
        Map<Character, Byte> out = new HashMap<>();
        for (int septet = 0; septet < 128; ++septet) {
            char c = table.charAt(septet);
            // Everything up to and including EXT_CODE is a marker, not a char.
            if (c > EXT_CODE) {
                out.put(c, (byte) septet);
            }
        }
        return Map.copyOf(out);
    }

    /**
     * Collect the first char of each multi-char entry.
     *
     * @param of A septet to string map, e.g. one of the {@code *_SX} maps.
     * @return Immutable set with the first char of every value in the map.
     */
    public static Set<Character> firstChars(Map<Byte, String> of) {
        Set<Character> out = new HashSet<>();
        for (String s : of.values()) {
            out.add(s.charAt(0));
        }
        return Set.copyOf(out);
    }

    /**
     * Swap keys and values of a septet to string map.
     *
     * @param map A septet to string map, e.g. one of the {@code *_SX} maps.
     * @return Immutable map from each string to its septet.
     */
    public static Map<String, Byte> flip(Map<Byte, String> map) {
        // Map.copyOf does not keep iteration order, so a plain HashMap is enough.
        Map<String, Byte> flipped = new HashMap<>();
        for (Map.Entry<Byte, String> entry : map.entrySet()) {
            flipped.put(entry.getValue(), entry.getKey());
        }
        return Map.copyOf(flipped);
    }
}