MoreCharsetsProvider.java

/*
 * Copyright (c) 2020, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.strings.enc;

import java.nio.charset.Charset;
import java.nio.charset.spi.CharsetProvider;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.forIso639;

/**
 * Charset provider for GSM, UCS2, T.61 and TBCD charsets.
 *
 * <p>Lookup in {@link #charsetForName(String)} compares names exactly as
 * given, i.e. it is case-sensitive. Besides the fixed aliases, any name of
 * the form {@code 3GPP-23.038[-locking][+shift]}, where each optional part
 * is a two or three letter lower-case language code, is resolved through
 * {@code GSMCharset.forNationalLanguageIdentifier}.
 */
public class MoreCharsetsProvider extends CharsetProvider {
    /** The predefined charsets returned by {@link #charsets()}. */
    private static final List<Charset> CHARSETS = List.of(
                T61Charset.T61,
                TBCDCharset.TBCD,
                TBCDCharset.TBCD_ODD,
                GSMCharset.UCS2,
                GSMCharset.GSM,
                GSMCharset.GSM_Turkish,
                GSMCharset.GSM_Spanish,
                GSMCharset.GSM_Portuguese,
                GSMCharset.GSM_Bengali,
                GSMCharset.GSM_Gujarati,
                GSMCharset.GSM_Hindi,
                GSMCharset.GSM_Kannada,
                GSMCharset.GSM_Malayalam,
                GSMCharset.GSM_Oriya,
                GSMCharset.GSM_Punjabi,
                GSMCharset.GSM_Tamil,
                GSMCharset.GSM_Telugu,
                GSMCharset.GSM_Urdu);

    /**
     * Generic {@code 3GPP-23.038} charset name with an optional
     * {@code -<locking>} part and an optional {@code +<shift>} part, each a
     * two or three letter lower-case code. A named group is {@code null}
     * when its part is not present in the matched name.
     */
    private static final Pattern LOCALE_3GPP_PATTERN = Pattern.compile(
            "3GPP-23\\.038" +
                    "(-(?<locking>[a-z][a-z][a-z]?))?" +
                    "(\\+(?<shift>[a-z][a-z][a-z]?))?");

    /**
     * @return An iterator over the predefined charsets of this provider.
     */
    @Override
    public Iterator<Charset> charsets() {
        return CHARSETS.iterator();
    }

    /**
     * Look up a charset by name or alias.
     *
     * @param name The charset name, compared case-sensitively.
     * @return The matching charset, or null if the name is neither a known
     *         alias nor a well-formed {@code 3GPP-23.038} name.
     */
    @Override
    public Charset charsetForName(String name) {
        switch (name) {
            case "T.61":
            case "CP1036":
            case "IBM-01036":
                return T61Charset.T61;
            case "BCD":
            case "TBCD":
                return TBCDCharset.TBCD;
            case "BCD-odd":
            case "TBCD-odd":
                return TBCDCharset.TBCD_ODD;
            case "UCS2":
                return GSMCharset.UCS2;
            case "GSM":
            case "3GPP-23.038":
                return GSMCharset.GSM;
            case "GSM-tr":
            case "3GPP-23.038-tr+tr":
                return GSMCharset.GSM_Turkish;
            case "GSM-es":
            case "3GPP-23.038+es":
                return GSMCharset.GSM_Spanish;
            case "GSM-pt":
            case "3GPP-23.038-pt+pt":
                return GSMCharset.GSM_Portuguese;
            case "GSM-bn":
            case "3GPP-23.038-bn+bn":
            // Legacy spelling: "be" is not the code used by the "GSM-bn"
            // alias, but it was accepted before, so it is kept working.
            case "3GPP-23.038-be+be":
                return GSMCharset.GSM_Bengali;
            case "GSM-gu":
            case "3GPP-23.038-gu+gu":
                return GSMCharset.GSM_Gujarati;
            case "GSM-hi":
            case "3GPP-23.038-hi+hi":
                return GSMCharset.GSM_Hindi;
            case "GSM-kn":
            case "3GPP-23.038-kn+kn":
                return GSMCharset.GSM_Kannada;
            case "GSM-ml":
            case "3GPP-23.038-ml+ml":
                return GSMCharset.GSM_Malayalam;
            case "GSM-or":
            case "3GPP-23.038-or+or":
                return GSMCharset.GSM_Oriya;
            case "GSM-pa":
            case "3GPP-23.038-pa+pa":
                return GSMCharset.GSM_Punjabi;
            case "GSM-ta":
            case "3GPP-23.038-ta+ta":
                return GSMCharset.GSM_Tamil;
            case "GSM-te":
            case "3GPP-23.038-te+te":
                return GSMCharset.GSM_Telugu;
            case "GSM-ur":
            case "3GPP-23.038-ur+ur":
                return GSMCharset.GSM_Urdu;
            default:
                // Cheap prefix check before running the regex on names that
                // cannot possibly be a 3GPP-23.038 variant.
                if (name.startsWith("3GPP-23.038")) {
                    Matcher matcher = LOCALE_3GPP_PATTERN.matcher(name);
                    if (matcher.matches()) {
                        // NOTE(review): either group is null when that part is
                        // missing from the name; forIso639 is assumed to cope
                        // with null and with unknown codes - confirm.
                        var locking = forIso639(matcher.group("locking"));
                        var shift = forIso639(matcher.group("shift"));
                        return GSMCharset.forNationalLanguageIdentifier(locking, shift);
                    }
                }
                return null;
        }
    }
}