MoreCharsetsProvider.java
/*
* Copyright (c) 2020, Stein Eldar Johnsen
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package net.morimekta.strings.enc;
import java.nio.charset.Charset;
import java.nio.charset.spi.CharsetProvider;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.forIso639;
/**
* Charset provider for GSM, UCS2, T.61 and TBCD charsets.
*/
public class MoreCharsetsProvider extends CharsetProvider {
    /** Common prefix of all long-form 3GPP charset names handled by this provider. */
    private static final String PREFIX_3GPP = "3GPP-23.038";

    /**
     * Pattern for long-form 3GPP names that are not listed explicitly in
     * {@link #charsetForName(String)}. After the fixed prefix it accepts an
     * optional {@code -<locking>} part and an optional {@code +<shift>} part,
     * each being two or three lowercase ASCII letters.
     */
    private static final Pattern LOCALE_3GPP_PATTERN = Pattern.compile(
            "3GPP-23\\.038" +
            "(-(?<locking>[a-z][a-z][a-z]?))?" +
            "(\\+(?<shift>[a-z][a-z][a-z]?))?");

    /** The charsets listed by {@link #charsets()}, in iteration order. */
    private static final List<Charset> CHARSETS = List.of(
            T61Charset.T61,
            TBCDCharset.TBCD,
            TBCDCharset.TBCD_ODD,
            GSMCharset.UCS2,
            GSMCharset.GSM,
            GSMCharset.GSM_Turkish,
            GSMCharset.GSM_Spanish,
            GSMCharset.GSM_Portuguese,
            GSMCharset.GSM_Bengali,
            GSMCharset.GSM_Gujarati,
            GSMCharset.GSM_Hindi,
            GSMCharset.GSM_Kannada,
            GSMCharset.GSM_Malayalam,
            GSMCharset.GSM_Oriya,
            GSMCharset.GSM_Punjabi,
            GSMCharset.GSM_Tamil,
            GSMCharset.GSM_Telugu,
            GSMCharset.GSM_Urdu);

    /**
     * Create the charset provider.
     */
    public MoreCharsetsProvider() {}

    /**
     * @return An iterator over the fixed list of charsets this provider announces.
     */
    @Override
    public Iterator<Charset> charsets() {
        return CHARSETS.iterator();
    }

    /**
     * Look up a charset by one of its known names or aliases. The comparison
     * is case-sensitive. Long-form names starting with {@code 3GPP-23.038}
     * that are not listed explicitly are parsed as
     * {@code 3GPP-23.038[-<locking>][+<shift>]} and resolved through
     * {@link GSMCharset#forNationalLanguageIdentifier}.
     *
     * @param name The charset name or alias to look up.
     * @return The matching charset, or {@code null} if the name is not
     *         recognized by this provider.
     */
    @Override
    public Charset charsetForName(String name) {
        switch (name) {
            case "T.61":
            case "CP1036":
            case "IBM-01036":
                return T61Charset.T61;
            case "BCD":
            case "TBCD":
                return TBCDCharset.TBCD;
            case "BCD-odd":
            case "TBCD-odd":
                return TBCDCharset.TBCD_ODD;
            case "UCS2":
                return GSMCharset.UCS2;
            case "GSM":
            case "3GPP-23.038":
                return GSMCharset.GSM;
            case "GSM-tr":
            case "3GPP-23.038-tr+tr":
                return GSMCharset.GSM_Turkish;
            case "GSM-es":
            case "3GPP-23.038+es":
                return GSMCharset.GSM_Spanish;
            case "GSM-pt":
            case "3GPP-23.038-pt+pt":
                return GSMCharset.GSM_Portuguese;
            case "GSM-bn":
            case "3GPP-23.038-bn+bn":
            // Legacy spelling: "be" is not the ISO 639-1 code for Bengali
            // ("bn"), but it is kept so callers using the old alias still
            // resolve to the same charset.
            case "3GPP-23.038-be+be":
                return GSMCharset.GSM_Bengali;
            case "GSM-gu":
            case "3GPP-23.038-gu+gu":
                return GSMCharset.GSM_Gujarati;
            case "GSM-hi":
            case "3GPP-23.038-hi+hi":
                return GSMCharset.GSM_Hindi;
            case "GSM-kn":
            case "3GPP-23.038-kn+kn":
                return GSMCharset.GSM_Kannada;
            case "GSM-ml":
            case "3GPP-23.038-ml+ml":
                return GSMCharset.GSM_Malayalam;
            case "GSM-or":
            case "3GPP-23.038-or+or":
                return GSMCharset.GSM_Oriya;
            case "GSM-pa":
            case "3GPP-23.038-pa+pa":
                return GSMCharset.GSM_Punjabi;
            case "GSM-ta":
            case "3GPP-23.038-ta+ta":
                return GSMCharset.GSM_Tamil;
            case "GSM-te":
            case "3GPP-23.038-te+te":
                return GSMCharset.GSM_Telugu;
            case "GSM-ur":
            case "3GPP-23.038-ur+ur":
                return GSMCharset.GSM_Urdu;
            default:
                return parse3gppName(name);
        }
    }

    /**
     * Resolve a long-form 3GPP name that has no explicit alias.
     *
     * @param name The requested charset name.
     * @return The charset for the parsed locking / shift languages, or
     *         {@code null} if the name does not match the 3GPP name pattern.
     */
    private static Charset parse3gppName(String name) {
        // Cheap pre-filter so unrelated names never reach the regex.
        if (!name.startsWith(PREFIX_3GPP)) {
            return null;
        }
        Matcher matcher = LOCALE_3GPP_PATTERN.matcher(name);
        if (!matcher.matches()) {
            return null;
        }
        // A group is null when its optional part is absent from the name.
        // NOTE(review): forIso639 is assumed to accept null - confirm.
        var locking = forIso639(matcher.group("locking"));
        var shift = forIso639(matcher.group("shift"));
        return GSMCharset.forNationalLanguageIdentifier(locking, shift);
    }
}