GSMCharset.java
/*
* Copyright (c) 2020, Stein Eldar Johnsen
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package net.morimekta.strings.enc;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.text.Normalizer;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import static java.lang.Character.isHighSurrogate;
import static java.lang.Character.isLowSurrogate;
import static java.util.Objects.requireNonNull;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Bengali;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Default;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Gujarati;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Hindi;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Kannada;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Malayalam;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Oriya;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Portuguese;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Punjabi;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Spanish;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Tamil;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Telugu;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Turkish;
import static net.morimekta.strings.enc.GSMNationalLanguageIdentifier.Urdu;
import static net.morimekta.strings.internal.GSMCharsetUtil.EXT_CODE;
/**
* The GSM Charset is actually a set of charsets that uses the
* same rules for how to encode, but with different character
* tables based on preset language settings.
* <p>
* Also known as <b>GSM 03.38</b> or <b>3GPP 23.038</b>.
* <p>
* See <a href="https://en.wikipedia.org/wiki/GSM_03.38#GSM_7-bit_default_alphabet_and_extension_table_of_3GPP_TS_23.038_/_GSM_03.38">GSM 03.38</a> on Wikipedia.<br>
* See <a href="https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/17.00.00_60/ts_123038v170000p.pdf">3GPP TS 23.038 v 17.0</a> on ETSI.org.<br>
*/
public class GSMCharset extends Charset {
/**
* The UCS2 charset. Similar to UTF-16, but originally did not support
* surrogate pairs, and is always big endian (no endian marker on encoded
* stream). Used as the fallback when no GSM charset can encode a text.
*/
public static final Charset UCS2 = Charset.forName("ISO-10646-UCS-2");
/**
* The default GSM charset (standard western NLI): the Default national
* language identifier is used for both the locking shift and the single shift.
*/
public static final GSMCharset GSM = new GSMCharset();
/**
* The Turkish GSM charset: Turkish locking shift with Turkish single shift.
*/
public static final GSMCharset GSM_Turkish = new GSMCharset(Turkish, Turkish);
/**
* The Spanish GSM charset: Default locking shift with Spanish single shift.
*/
public static final GSMCharset GSM_Spanish = new GSMCharset(Default, Spanish);
/**
* The Portuguese GSM charset: Portuguese locking shift with Portuguese single shift.
*/
public static final GSMCharset GSM_Portuguese = new GSMCharset(Portuguese, Portuguese);
/**
* The Bengali GSM charset: Bengali locking shift with Bengali single shift.
*/
public static final GSMCharset GSM_Bengali = new GSMCharset(Bengali, Bengali);
/**
* The Gujarati GSM charset: Gujarati locking shift with Gujarati single shift.
*/
public static final GSMCharset GSM_Gujarati = new GSMCharset(Gujarati, Gujarati);
/**
* The Hindi GSM charset: Hindi locking shift with Hindi single shift.
*/
public static final GSMCharset GSM_Hindi = new GSMCharset(Hindi, Hindi);
/**
* The Kannada GSM charset: Kannada locking shift with Kannada single shift.
*/
public static final GSMCharset GSM_Kannada = new GSMCharset(Kannada, Kannada);
/**
* The Malayalam GSM charset: Malayalam locking shift with Malayalam single shift.
*/
public static final GSMCharset GSM_Malayalam = new GSMCharset(Malayalam, Malayalam);
/**
* The Oriya GSM charset: Oriya locking shift with Oriya single shift.
*/
public static final GSMCharset GSM_Oriya = new GSMCharset(Oriya, Oriya);
/**
* The Punjabi GSM charset: Punjabi locking shift with Punjabi single shift.
*/
public static final GSMCharset GSM_Punjabi = new GSMCharset(Punjabi, Punjabi);
/**
* The Tamil GSM charset: Tamil locking shift with Tamil single shift.
*/
public static final GSMCharset GSM_Tamil = new GSMCharset(Tamil, Tamil);
/**
* The Telugu GSM charset: Telugu locking shift with Telugu single shift.
*/
public static final GSMCharset GSM_Telugu = new GSMCharset(Telugu, Telugu);
/**
* The Urdu GSM charset: Urdu locking shift with Urdu single shift.
*/
public static final GSMCharset GSM_Urdu = new GSMCharset(Urdu, Urdu);
/**
* Immutable set of all the predefined GSM charsets declared above, i.e. the
* default national language identifier combinations. This is the candidate
* set used by {@link #detectCharset(String)}.
*/
public static final Set<GSMCharset> GSM_CHARSETS = Set.of(
GSM, GSM_Turkish, GSM_Spanish, GSM_Portuguese,
GSM_Bengali, GSM_Gujarati, GSM_Hindi, GSM_Kannada, GSM_Malayalam,
GSM_Oriya, GSM_Punjabi, GSM_Tamil, GSM_Telugu, GSM_Urdu);
// Identifier selecting the locking shift (main) character table.
private final GSMNationalLanguageIdentifier lockingShift;
// Identifier selecting the single shift (extended / escape) character table.
private final GSMNationalLanguageIdentifier singleShift;
/**
* Detect the best fitting GSM charset for the text, choosing among the
* predefined charsets in {@link #GSM_CHARSETS}. Since only those predefined
* locking shift / single shift pairs are considered, the result is not
* necessarily the true optimal combination.
* Will fall back to UCS2 (UTF16-BE) if no GSM charset is able to encode the text.
*
* @param text The text to detect charset for.
* @return The detected charset.
*/
public static Charset detectCharset(String text) {
return detectCharset(text, GSM_CHARSETS);
}
/**
 * Pick the simplest charset among the given alternatives that is able to
 * encode the whole text.
 * <p>
 * An empty text always yields {@link #GSM}. Otherwise every alternative that
 * cannot encode the text is discarded; if {@link #GSM} survives it is
 * preferred, and if nothing survives {@link #UCS2} is returned. In all other
 * cases the surviving charset with the fewest {@link #modifications()} wins.
 * Only the supplied combinations are considered, so the result is not
 * necessarily the true optimal pairing of language identifiers.
 *
 * @param text The text to detect charset for.
 * @param alt  Alternative GSM charsets to check.
 * @return The detected charset.
 */
public static Charset detectCharset(String text, Set<GSMCharset> alt) {
    requireNonNull(text, "text == null");
    requireNonNull(alt, "alt == null");
    if (text.isEmpty()) {
        return GSM;
    }

    // Work on a private copy so the caller's set is never modified.
    Set<GSMCharset> candidates = new HashSet<>(alt);
    candidates.removeIf(candidate -> !candidate.canEncode(text));
    if (candidates.isEmpty()) {
        return UCS2;
    }
    if (candidates.contains(GSM)) {
        return GSM;
    }

    // Keep the first candidate with the lowest modification count.
    Comparator<GSMCharset> fewestModifications = Comparator.comparingInt(GSMCharset::modifications);
    GSMCharset simplest = null;
    for (GSMCharset candidate : candidates) {
        if (simplest == null || fewestModifications.compare(candidate, simplest) < 0) {
            simplest = candidate;
        }
    }
    return simplest;
}
/**
 * Look up, or create, the GSM charset for a pair of national language
 * identifiers. One of the predefined charsets in {@link #GSM_CHARSETS} is
 * returned when it uses exactly this pair; otherwise a new charset instance
 * is created for the combination.
 *
 * @param lockingLocale The main locking locale.
 * @param extLocale     The ext (escape) locale.
 * @return The charset.
 */
public static GSMCharset forNationalLanguageIdentifier(GSMNationalLanguageIdentifier lockingLocale,
                                                       GSMNationalLanguageIdentifier extLocale) {
    requireNonNull(lockingLocale, "lockingLocale == null");
    requireNonNull(extLocale, "extLocale == null");
    if (lockingLocale == Default && extLocale == Default) {
        return GSM;
    }
    return GSM_CHARSETS
            .stream()
            .filter(known -> known.lockingShift == lockingLocale)
            .filter(known -> known.singleShift == extLocale)
            .findFirst()
            .orElseGet(() -> new GSMCharset(lockingLocale, extLocale));
}
/**
* Create the default GSM charset, using the Default national language
* identifier for both the locking shift and the single shift.
*/
private GSMCharset() {
this(Default, Default);
}
/**
* Create a GSM charset for a locking shift / single shift combination. The
* canonical charset name and its aliases are derived from the two identifiers.
*
* @param lockingLocale Identifier of the locking shift (main) table. Must
*                      have a basic character table.
* @param extLocale Identifier of the single shift (ext / escape) table.
* @throws NullPointerException If either identifier is null, or if the
*                              locking identifier has no basic character table.
*/
private GSMCharset(GSMNationalLanguageIdentifier lockingLocale,
GSMNationalLanguageIdentifier extLocale) {
super(makeCharsetCanonicalName(lockingLocale, extLocale),
makeCharsetAliasNames(lockingLocale, extLocale));
// A charset cannot work without a main table; fail at construction time.
requireNonNull(lockingLocale.basic, "No basic character table for " + lockingLocale.name());
this.lockingShift = lockingLocale;
this.singleShift = extLocale;
}
/**
* Get the locking shift language identifier.
*
* @return The locking shift language identifier, indicating the main charset.
*/
public GSMNationalLanguageIdentifier getLockingShift() {
return lockingShift;
}
/**
* Get the single shift language identifier.
*
* @return The single shift language identifier, indicating the extended charset.
*/
public GSMNationalLanguageIdentifier getSingleShift() {
return singleShift;
}
/**
 * Score how far this charset deviates from the full default charset. A
 * non-default locking shift counts 1 point and a non-default single shift
 * counts 2 points, so the result is in the range 0 to 3. Used by
 * {@link GSMCharset#detectCharset(String)} to find the simplest charset
 * compatible with a string.
 *
 * @return The number of modification points from default.
 */
public int modifications() {
    int points = 0;
    if (lockingShift != Default) {
        points += 1;
    }
    if (singleShift != Default) {
        points += 2;
    }
    return points;
}
/**
 * Check if this charset combination can encode the given single character.
 * <p>
 * The extension (escape) code itself is never encodable as text, as it is
 * reserved for introducing single shift characters. Every other character is
 * encodable when it is present in either the locking shift basic table or the
 * single shift table. Only the escape code is rejected up front: characters
 * with a lower code point, such as line feed and carriage return, are decided
 * by the table lookup like any other character.
 *
 * @param c The char to check.
 * @return If the char can be encoded.
 */
public boolean canEncode(char c) {
    if (c == EXT_CODE) {
        return false;
    }
    return lockingShift.basicMap.containsKey(c) ||
           singleShift.shiftMap.containsKey(c);
}
/**
 * Check if this charset can encode the provided char sequence.
 * <p>
 * The sequence is scanned left to right. A character that may start a
 * two-character sequence known to the locking shift or single shift table is
 * matched together with its successor and both are consumed; otherwise the
 * character must be present in the basic or single shift map on its own.
 * The extension (escape) code itself is never encodable. Only the escape code
 * is rejected up front: characters with a lower code point, such as line feed
 * and carriage return, are decided by the table lookup like any other
 * character.
 *
 * @param cs The string to check.
 * @return If the string can be fully encoded with the charset.
 */
public boolean canEncode(CharSequence cs) {
    final int length = cs.length();
    for (int i = 0; i < length; ++i) {
        char c = cs.charAt(i);
        if (c == EXT_CODE) {
            return false;
        }
        boolean hasNext = i < (length - 1);
        // Two-char sequence mapped by the locking shift table.
        if (hasNext && lockingShift.basicExtended.contains(c)) {
            char n = cs.charAt(i + 1);
            if (lockingShift.basicStringCode.containsKey("" + c + n)) {
                ++i;
                continue;
            }
        }
        // Two-char sequence mapped by the single shift table.
        if (hasNext && singleShift.shiftExtended.contains(c)) {
            char n = cs.charAt(i + 1);
            if (singleShift.shiftStringCode.containsKey("" + c + n)) {
                ++i;
                continue;
            }
        }
        // Plain single character lookup.
        if (lockingShift.basicMap.containsKey(c) ||
            singleShift.shiftMap.containsKey(c)) {
            continue;
        }
        return false;
    }
    return true;
}
/**
 * Normalize the text into the closest string that can be encoded without
 * errors by this charset.
 * <p>
 * Text that is already fully encodable is returned unchanged. Otherwise the
 * text is NFKC-normalized and run through the charset's encoder. Each
 * character the encoder rejects as unmappable is NFKD-decomposed, and the
 * parts of the decomposition that can be encoded are kept. When no part can
 * be kept, when the rejected character is a surrogate, or when the encoder
 * reports malformed input, the replacement character is written instead.
 * The resulting bytes are finally decoded back to a string with this charset.
 *
 * @param text        The string to normalize.
 * @param replacement The replacement character used when an unmappable char
 *                    has no encodable partial replacement.
 * @return The string with unmappable characters replaced with the closest
 *         equivalent if possible, or with the replacement character if not.
 * @throws IllegalArgumentException If the replacement character cannot be
 *                                  encoded by this charset.
 */
public String normalize(CharSequence text, char replacement) {
    if (!canEncode(replacement)) {
        throw new IllegalArgumentException("Unable to encode replacement");
    }
    if (canEncode(text)) {
        return text.toString();
    }
    // Encoded replacement; always written via duplicate() so its position is never consumed.
    ByteBuffer rb = encode(String.valueOf(replacement));
    String normalized = Normalizer.normalize(text, Normalizer.Form.NFKC);
    ByteBuffer bb = ByteBuffer.allocate(Math.max(128, normalized.length()));
    CharBuffer cb = CharBuffer.wrap(normalized);
    CharsetEncoder encoder = newEncoder();
    // Upper bound on the output produced for one input char, used to reserve space.
    int maxBytesPerChar = Math.max(1, (int) Math.ceil(encoder.maxBytesPerChar()));
    CoderResult result;
    // NOTE(review): the encoder is only ever driven with endOfInput == false and is never
    // flushed; confirm GSMCharsetEncoder keeps no pending input/output at end of text.
    while (!(result = encoder.encode(cb, bb, false)).isUnderflow()) {
        if (result.isOverflow()) {
            // Output buffer exhausted: double it and retry from the current input position.
            bb = ensureRemaining(bb, bb.remaining() + bb.capacity());
            continue;
        }
        // Remaining results are errors (unmappable or malformed) on the input at cb's position.
        boolean replaced = false;
        if (result.isUnmappable()) {
            char c = cb.charAt(0);
            if (!isLowSurrogate(c) && !isHighSurrogate(c)) {
                String decomposed = Normalizer.normalize(String.valueOf(c), Normalizer.Form.NFKD);
                for (char d : decomposed.toCharArray()) {
                    if (canEncode(d)) {
                        // Reserve room first; an overflow here would silently drop the char.
                        bb = ensureRemaining(bb, maxBytesPerChar);
                        encoder.encode(CharBuffer.wrap(String.valueOf(d)), bb, false);
                        replaced = true;
                    }
                }
            }
        }
        if (!replaced) {
            bb = ensureRemaining(bb, rb.remaining());
            bb.put(rb.duplicate());
        }
        // Skip the offending input so the loop always makes progress.
        cb.position(cb.position() + result.length());
    }
    byte[] buffer = new byte[bb.position()];
    bb.flip();
    bb.get(buffer);
    return new String(buffer, this);
}

/**
 * Return a buffer holding the same written content with at least {@code needed} bytes remaining.
 *
 * @param buffer The buffer being written to (in write mode).
 * @param needed The number of bytes that must be writable.
 * @return The same buffer if large enough, otherwise a larger copy positioned after the content.
 */
private static ByteBuffer ensureRemaining(ByteBuffer buffer, int needed) {
    if (buffer.remaining() >= needed) {
        return buffer;
    }
    int capacity = Math.max(buffer.capacity() * 2, buffer.position() + needed);
    return ByteBuffer.allocate(capacity).put(buffer.flip());
}
/**
 * Tell whether this charset contains the given charset. Only another GSM
 * charset using the very same locking shift and single shift identifiers
 * is considered contained.
 *
 * @param charset The charset to check.
 * @return True if the given charset is a GSM charset with identical identifiers.
 */
@Override
public boolean contains(Charset charset) {
    if (!(charset instanceof GSMCharset)) {
        return false;
    }
    GSMCharset that = (GSMCharset) charset;
    return lockingShift == that.lockingShift
           && singleShift == that.singleShift;
}
/**
* Create a new decoder for this charset.
*
* @return A new GSMCharsetDecoder instance created for this charset.
*/
@Override
public CharsetDecoder newDecoder() {
return new GSMCharsetDecoder(this);
}
/**
* Create a new encoder for this charset.
*
* @return A new GSMCharsetEncoder instance created for this charset.
*/
@Override
public CharsetEncoder newEncoder() {
return new GSMCharsetEncoder(this);
}
/**
 * Build the canonical charset name for an identifier pair. The name always
 * starts with "3GPP-23.038"; a non-default locking identifier adds a
 * "-" suffix with its ISO 639-1 code, and a non-default single shift
 * identifier adds a "+" suffix with its ISO 639-1 code.
 *
 * @param lockingLocale The locking shift identifier.
 * @param extLocale     The single shift identifier.
 * @return The canonical charset name.
 * @throws NullPointerException If either identifier is null.
 */
private static String makeCharsetCanonicalName(
        GSMNationalLanguageIdentifier lockingLocale,
        GSMNationalLanguageIdentifier extLocale) {
    requireNonNull(lockingLocale, "lockingLocale == null");
    requireNonNull(extLocale, "extLocale == null");
    String name = "3GPP-23.038";
    if (lockingLocale != Default) {
        name += "-" + lockingLocale.iso639_1;
    }
    if (extLocale != Default) {
        name += "+" + extLocale.iso639_1;
    }
    return name;
}
/**
 * Build the alias names for an identifier pair.
 * <ul>
 *   <li>"GSM-" plus the single shift ISO 639-1 code, when that code is set and
 *       either both identifiers are the same, or the pair is Default locking
 *       with Spanish single shift.</li>
 *   <li>"GSM" for the Default / Default pair (when the rule above did not apply).</li>
 *   <li>No aliases for any other combination.</li>
 * </ul>
 *
 * @param lockingLocale The locking shift identifier.
 * @param extLocale     The single shift identifier.
 * @return The alias names, possibly empty.
 */
private static String[] makeCharsetAliasNames(
        GSMNationalLanguageIdentifier lockingLocale,
        GSMNationalLanguageIdentifier extLocale) {
    if (extLocale.iso639_1 != null) {
        boolean sameLocale = lockingLocale == extLocale;
        boolean defaultWithSpanish = lockingLocale == Default && extLocale == Spanish;
        if (sameLocale || defaultWithSpanish) {
            return new String[]{"GSM-" + extLocale.iso639_1};
        }
    }
    if (lockingLocale == Default && extLocale == Default) {
        return new String[]{"GSM"};
    }
    return new String[0];
}
}