T61Charset.java
/*
* Copyright (c) 2020, Stein Eldar Johnsen
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package net.morimekta.strings.enc;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.text.Normalizer;
import static java.text.Normalizer.Form.NFKC;
/**
* The T.61 charset, a.k.a. Teletex, is an ITU-T encoding standard used in some
* places in the telco world, though it has been phased out in places in favor
* of ASCII (called IA5, or string encoding type 5) and UTF-8. It is still
* part of the ASN.1 encoding standard, so it is still in use in some places
* (mainly legacy systems).
* <p>
* The encoding set contains a subset of US-ASCII characters, plus a number of
* selected diacritics and special characters used in major latin based
* languages.
* <p>
* - <a href="https://en.wikipedia.org/wiki/ITU_T.61">Wikipedia on T.61</a><br>
* - <a href="https://en.wikipedia.org/wiki/Teletex">Wikipedia on Teletex</a>
*/
public class T61Charset extends Charset {
    /** The T.61 charset instance. */
    public static final Charset T61 = new T61Charset();

    /**
     * Upper bound on the number of bytes a single {@code char} may encode to.
     * This is the value handed to {@link CharsetEncoder} as
     * {@code maxBytesPerChar}; callers such as {@code String.getBytes} size
     * their output buffer from it, so the encoder must never exceed it.
     * Compatibility decompositions longer than this are treated as unmappable.
     */
    private static final int MAX_BYTES_PER_CHAR = 4;

    /**
     * Byte value to character table: the char at index {@code i} is the
     * character for byte value {@code i} (0..255). A {@code '\0'} entry marks
     * a byte value that has no character assigned.
     */
    private static final String MAPPING =
            "\0\0\0\0\0\0\0\0" + "\b\0\n\0\f\r\016\017" +
            "\0\0\0\0\0\0\0\0" + "\0\u008e\u001a\u001b\0\u008f\0\0" +
            " !\"\0\0%&'" + "()*+,-./" +
            "01234567" + "89:;<=>?" +
            "@ABCDEFG" + "HIJKLMNO" +
            "PQRSTUVW" + "XYZ[\0]\0_" +
            "\0abcdefg" + "hijklmno" +
            "pqrstuvw" + "xyz\0|\0\0\177" +
            "\0\0\0\0\0\0\0\0" + "\0\0\0\u008b\u008c\0\0\0" +
            "\0\0\0\0\0\0\0\0" + "\0\0\0\u009b\0\0\0\0" +
            "\u00A0¡¢£$¥#§" + "¤\0\0«\0\0\0\0" +
            "°±²³×µ¶·" + "÷\0\0»¼½¾¿" +
            "\0\u0300\u0301\u0302\u0303\u0304\u0306\u0307" + "\u0308\u0308\u030a\u0327\u0332\u030b\u0328\u030c" +
            "\0\0\0\0\0\0\0\0" + "\0\0\0\0\0\0\0\0" +
            "ΩÆЪĦ\0IJĿ" + "ŁØŒºÞŦŊʼn" +
            "ĸæđðħıijŀ" + "łøœßþŧŋ\0";

    /** Registers the charset under its canonical name and aliases. */
    private T61Charset() {
        super("T.61", new String[]{"IBM-01036", "CP1036"});
    }

    /**
     * Tells whether this charset contains the given charset.
     *
     * @param charset The charset to check.
     * @return True only if the other charset is also a T.61 charset.
     */
    @Override
    public boolean contains(Charset charset) {
        // This is not QUITE a subset of US-ASCII, and US-ASCII is not QUITE
        // a subset of this.
        return charset instanceof T61Charset;
    }

    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder();
    }

    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder();
    }

    /**
     * Decoder from T.61 bytes to chars. A diacritic byte is a prefix to the
     * letter it modifies, so the two bytes are read together and composed.
     */
    static class Decoder extends CharsetDecoder {
        protected Decoder() {
            super(T61, 1, 1);
        }

        /**
         * Decodes as many complete byte sequences as fit in the output.
         *
         * @param byteBuffer The input bytes.
         * @param charBuffer The output chars.
         * @return {@code UNDERFLOW} when the input is consumed, or when it ends
         *         with a diacritic byte whose letter has not arrived yet (the
         *         byte is left unread so the caller can supply more input);
         *         {@code OVERFLOW} when the output is full; an unmappable
         *         result positioned at the offending byte sequence otherwise.
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer byteBuffer, CharBuffer charBuffer) {
            while (byteBuffer.hasRemaining()) {
                if (!charBuffer.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }
                // Every failure path rewinds to here, so that nothing of a
                // partially handled sequence counts as consumed.
                final int start = byteBuffer.position();
                final char c = charAt(byteBuffer.get());
                if (c == '\0') {
                    byteBuffer.position(start);
                    return CoderResult.unmappableForLength(1);
                }
                if (!isCombining(c)) {
                    charBuffer.put(c);
                    continue;
                }

                // T.61 diacritics come *before* the letter, not after as in UCS.
                if (!byteBuffer.hasRemaining()) {
                    // The letter may be in the next chunk of input. If this
                    // really is the end of input, CharsetDecoder turns the
                    // left-over byte into a malformed-input result itself.
                    byteBuffer.position(start);
                    return CoderResult.UNDERFLOW;
                }
                final char base = charAt(byteBuffer.get());
                if (base == '\0') {
                    // The diacritic has nothing valid to attach to.
                    byteBuffer.position(start);
                    return CoderResult.unmappableForLength(2);
                }
                final String composed = Normalizer.normalize(new String(new char[]{base, c}), NFKC);
                if (composed.length() > charBuffer.remaining()) {
                    // Not every pair composes to a single char.
                    byteBuffer.position(start);
                    return CoderResult.OVERFLOW;
                }
                charBuffer.put(composed);
            }
            return CoderResult.UNDERFLOW;
        }
    }

    /**
     * Encoder from chars to T.61 bytes. Characters present in the mapping table
     * are written as their single byte; other characters are decomposed
     * (NFKD) and written as letter plus diacritics, with each diacritic byte
     * moved in front of the letter byte.
     */
    static class Encoder extends CharsetEncoder {
        protected Encoder() {
            super(T61, 1f, MAX_BYTES_PER_CHAR, new byte[]{'?'});
        }

        /**
         * @param c The char to check.
         * @return True if {@link #encodeLoop} would encode the char.
         */
        @Override
        public boolean canEncode(char c) {
            return encodableForm(c) != null;
        }

        /**
         * @param cs The char sequence to check.
         * @return True if every char in the sequence can be encoded.
         */
        @Override
        public boolean canEncode(CharSequence cs) {
            return cs.chars().allMatch(c -> canEncode((char) c));
        }

        /**
         * Encodes as many complete chars as fit in the output. Nothing is
         * written for a char that is reported as unmappable or that does not
         * fit.
         *
         * @param charBuffer The input chars.
         * @param byteBuffer The output bytes.
         * @return {@code UNDERFLOW} when all input is consumed,
         *         {@code OVERFLOW} when the output is full, or an unmappable
         *         result positioned at the offending char(s).
         */
        @Override
        protected CoderResult encodeLoop(CharBuffer charBuffer, ByteBuffer byteBuffer) {
            while (charBuffer.hasRemaining()) {
                final int start = charBuffer.position();
                final char c = charBuffer.get();
                if (Character.isHighSurrogate(c)) {
                    // No surrogate pairs are supported. Report the pair as one
                    // unit when the low half is there, else just this char.
                    final boolean pair = charBuffer.hasRemaining()
                            && Character.isLowSurrogate(charBuffer.get(charBuffer.position()));
                    charBuffer.position(start);
                    return CoderResult.unmappableForLength(pair ? 2 : 1);
                }
                if (Character.isLowSurrogate(c)) {
                    charBuffer.position(start);
                    return CoderResult.unmappableForLength(1);
                }

                // Validate the whole expansion before writing any of it.
                final String expanded = encodableForm(c);
                if (expanded == null) {
                    charBuffer.position(start);
                    return CoderResult.unmappableForLength(1);
                }
                if (expanded.length() > byteBuffer.remaining()) {
                    charBuffer.position(start);
                    return CoderResult.OVERFLOW;
                }

                for (int i = 0; i < expanded.length(); ++i) {
                    final char part = expanded.charAt(i);
                    final byte encoded = (byte) indexOf(part);
                    if (isCombining(part) && byteBuffer.position() > 0) {
                        // we must swap the order of char and matching diacritic.
                        // NOTE(review): this looks at the previous byte of the
                        // output buffer, which is also how a stand-alone
                        // combining char gets attached to the letter before it;
                        // it cannot do so if that letter is no longer in the
                        // buffer.
                        final byte previous = byteBuffer.get(byteBuffer.position() - 1);
                        if (previous > 0) {
                            byteBuffer.position(byteBuffer.position() - 1);
                            byteBuffer.put(encoded).put(previous);
                            continue;
                        }
                    }
                    byteBuffer.put(encoded);
                }
            }
            return CoderResult.UNDERFLOW;
        }
    }

    /**
     * Find the chars that should be looked up in the mapping table in order
     * to encode a single char.
     *
     * @param c The char to encode.
     * @return The char itself if it is in the table, otherwise its NFKD
     *         decomposition if every part of that is in the table. Null if the
     *         char cannot be encoded.
     */
    private static String encodableForm(char c) {
        if (c == '\0' || Character.isSurrogate(c)) {
            return null;
        }
        if (indexOf(c) >= 0) {
            // Has its own byte value. Decomposing first would lose it, and for
            // some chars (e.g. the vulgar fractions) produce unmappable parts.
            return String.valueOf(c);
        }
        final String expanded = Normalizer.normalize(String.valueOf(c), Normalizer.Form.NFKD);
        if (expanded.length() > MAX_BYTES_PER_CHAR) {
            return null;
        }
        for (int i = 0; i < expanded.length(); ++i) {
            final char part = expanded.charAt(i);
            if (part == '\0' || indexOf(part) < 0) {
                return null;
            }
        }
        return expanded;
    }

    /**
     * @param c The char to check.
     * @return True if the char is in the range handled as a diacritic that
     *         is written before its letter.
     */
    private static boolean isCombining(char c) {
        return c >= '\u0300' && c <= '\u0340';
    }

    /**
     * Look up the byte value for a char.
     *
     * @param c The char to look up.
     * @return The index of the char in the mapping table, or -1 if it is not
     *         there. Note that {@code '\0'} yields 0, though it marks unassigned
     *         entries and is not itself encodable.
     */
    static int indexOf(char c) {
        // Most of the lower half of the table maps to itself.
        if (c < 0x100 && MAPPING.charAt(c) == c) {
            return c;
        }
        return MAPPING.indexOf(c);
    }

    /**
     * Look up the char for a byte value.
     *
     * @param i The byte value, either as a signed byte (-128..127) or as an
     *          unsigned value (0..255).
     * @return The mapped char, or {@code '\0'} if the byte value is unassigned.
     */
    static char charAt(int i) {
        if (i < 0) {
            i = 0x100 + i;
        }
        return MAPPING.charAt(i);
    }
}