MoreCharsetsProvider.java

  1. /*
  2.  * Copyright (c) 2020, Stein Eldar Johnsen
  3.  *
  4.  * Licensed to the Apache Software Foundation (ASF) under one
  5.  * or more contributor license agreements. See the NOTICE file
  6.  * distributed with this work for additional information
  7.  * regarding copyright ownership. The ASF licenses this file
  8.  * to you under the Apache License, Version 2.0 (the
  9.  * "License"); you may not use this file except in compliance
  10.  * with the License. You may obtain a copy of the License at
  11.  *
  12.  *   http://www.apache.org/licenses/LICENSE-2.0
  13.  *
  14.  * Unless required by applicable law or agreed to in writing,
  15.  * software distributed under the License is distributed on an
  16.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  17.  * KIND, either express or implied. See the License for the
  18.  * specific language governing permissions and limitations
  19.  * under the License.
  20.  */
  21. package net.morimekta.strings.enc;

  22. import java.nio.charset.Charset;
  23. import java.nio.charset.spi.CharsetProvider;
  24. import java.util.Iterator;
  25. import java.util.List;
  26. import java.util.regex.Matcher;
  27. import java.util.regex.Pattern;

  28. import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.forIso639;

  29. /**
  30.  * Charset provider for GSM, UCS2, T.61 and TBCD charsets.
  31.  */
  32. public class MoreCharsetsProvider extends CharsetProvider {
  33.     /**
  34.      * Create the charset provider.
  35.      */
  36.     public MoreCharsetsProvider() {}

  37.     private static final List<Charset> CHARSETS = List.of(
  38.             T61Charset.T61,
  39.             TBCDCharset.TBCD,
  40.             TBCDCharset.TBCD_ODD,
  41.             GSMCharset.UCS2,
  42.             GSMCharset.GSM,
  43.             GSMCharset.GSM_Turkish,
  44.             GSMCharset.GSM_Spanish,
  45.             GSMCharset.GSM_Portuguese,
  46.             GSMCharset.GSM_Bengali,
  47.             GSMCharset.GSM_Gujarati,
  48.             GSMCharset.GSM_Hindi,
  49.             GSMCharset.GSM_Kannada,
  50.             GSMCharset.GSM_Malayalam,
  51.             GSMCharset.GSM_Oriya,
  52.             GSMCharset.GSM_Punjabi,
  53.             GSMCharset.GSM_Tamil,
  54.             GSMCharset.GSM_Telugu,
  55.             GSMCharset.GSM_Urdu);

  56.     @Override
  57.     public Iterator<Charset> charsets() {
  58.         return CHARSETS.iterator();
  59.     }

  60.     @Override
  61.     public Charset charsetForName(String name) {
  62.         switch (name) {
  63.             case "T.61":
  64.             case "CP1036":
  65.             case "IBM-01036":
  66.                 return T61Charset.T61;
  67.             case "BCD":
  68.             case "TBCD":
  69.                 return TBCDCharset.TBCD;
  70.             case "BCD-odd":
  71.             case "TBCD-odd":
  72.                 return TBCDCharset.TBCD_ODD;
  73.             case "UCS2":
  74.                 return GSMCharset.UCS2;
  75.             case "GSM":
  76.             case "3GPP-23.038":
  77.                 return GSMCharset.GSM;
  78.             case "GSM-tr":
  79.             case "3GPP-23.038-tr+tr":
  80.                 return GSMCharset.GSM_Turkish;
  81.             case "GSM-es":
  82.             case "3GPP-23.038+es":
  83.                 return GSMCharset.GSM_Spanish;
  84.             case "GSM-pt":
  85.             case "3GPP-23.038-pt+pt":
  86.                 return GSMCharset.GSM_Portuguese;
  87.             case "GSM-bn":
  88.             case "3GPP-23.038-be+be":
  89.                 return GSMCharset.GSM_Bengali;
  90.             case "GSM-gu":
  91.             case "3GPP-23.038-gu+gu":
  92.                 return GSMCharset.GSM_Gujarati;
  93.             case "GSM-hi":
  94.             case "3GPP-23.038-hi+hi":
  95.                 return GSMCharset.GSM_Hindi;
  96.             case "GSM-kn":
  97.             case "3GPP-23.038-kn+kn":
  98.                 return GSMCharset.GSM_Kannada;
  99.             case "GSM-ml":
  100.             case "3GPP-23.038-ml+ml":
  101.                 return GSMCharset.GSM_Malayalam;
  102.             case "GSM-or":
  103.             case "3GPP-23.038-or+or":
  104.                 return GSMCharset.GSM_Oriya;
  105.             case "GSM-pa":
  106.             case "3GPP-23.038-pa+pa":
  107.                 return GSMCharset.GSM_Punjabi;
  108.             case "GSM-ta":
  109.             case "3GPP-23.038-ta+ta":
  110.                 return GSMCharset.GSM_Tamil;
  111.             case "GSM-te":
  112.             case "3GPP-23.038-te+te":
  113.                 return GSMCharset.GSM_Telugu;
  114.             case "GSM-ur":
  115.             case "3GPP-23.038-ur+ur":
  116.                 return GSMCharset.GSM_Urdu;
  117.             default:
  118.                 if (name.startsWith("3GPP-23.038")) {
  119.                     Matcher matcher = LOCALE_3GPP_PATTERN.matcher(name);
  120.                     if (matcher.matches()) {
  121.                         var locking = forIso639(matcher.group("locking"));
  122.                         var shift = forIso639(matcher.group("shift"));
  123.                         return GSMCharset.forNationalLanguageIdentifier(locking, shift);
  124.                     }
  125.                 }
  126.                 return null;
  127.         }
  128.     }

  129.     private static final Pattern LOCALE_3GPP_PATTERN = Pattern.compile(
  130.             "3GPP-23\\.038" +
  131.             "(-(?<locking>[a-z][a-z][a-z]?))?" +
  132.             "(\\+(?<shift>[a-z][a-z][a-z]?))?");
  133. }