// Utf8StreamReader.java
/*
* Copyright (c) 2016, Stein Eldar Johnsen
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package net.morimekta.strings.io;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
/**
 * Similar to java native {@link java.io.InputStreamReader}, but locked to
 * utf-8, and explicitly with no buffering whatsoever. It will only read one
 * byte at a time until it has a valid unicode char.
 * <p>
 * Code points outside the basic multilingual plane are returned as a
 * surrogate pair. If the pair does not fit in the requested read length, the
 * low surrogate is kept and returned first on the next read.
 * </p>
 * <p>
 * In strict mode malformed input causes an
 * {@link UnsupportedEncodingException}. In non-strict mode each malformed
 * entity is replaced with a single <code>'?'</code>; the bytes already read
 * for that entity are dropped, as nothing can be pushed back onto the stream.
 * </p>
 * <p>
 * In order to make this reader more efficient, rather wrap the input stream in
 * a BufferedInputStream, which can pass on any buffered bytes to later uses.
 * E.g.:
 * </p>
 * <pre>
 * Reader reader = new Utf8StreamReader(new BufferedInputStream(in));
 * </pre>
 */
public class Utf8StreamReader extends Reader {
    /** Scratch space for one encoded entity: the lead byte plus up to 5 extra bytes. */
    private final int[] buffer;
    /** If true, malformed input throws; otherwise it is replaced with '?'. */
    private final boolean strict;
    /** The wrapped stream, or null once this reader is closed. */
    private InputStream in;
    /** Low surrogate waiting to be returned, or 0 if there is none. */
    private char surrogate;

    /**
     * Make a stream reader with strict char parsing.
     *
     * @param in Stream to read from.
     */
    public Utf8StreamReader(InputStream in) {
        this(in, true);
    }

    /**
     * Make a stream reader.
     *
     * @param in     Stream to read from.
     * @param strict If character parsing should be strict.
     */
    public Utf8StreamReader(InputStream in, boolean strict) {
        this.in = in;
        this.buffer = new int[6];
        this.surrogate = 0;
        this.strict = strict;
    }

    /**
     * Read chars into the given array, one utf-8 entity at a time.
     *
     * @param char_buffer Array to write the decoded chars into.
     * @param off         Offset of the first char to write.
     * @param len         Maximum number of chars to read.
     * @return The number of chars read, or -1 if the stream ended before any
     *         char was read.
     * @throws IOException If the reader is closed, the stream ends inside an
     *                     entity, or the wrapped stream fails.
     * @throws UnsupportedEncodingException If strict and the input is malformed.
     * @throws IndexOutOfBoundsException If off or len is negative, or len is
     *                     larger than the array length minus off.
     */
    @Override
    public int read(char[] char_buffer, int off, int len) throws IOException {
        if (in == null) {
            throw new IOException("Reading from a closed stream.");
        }
        // Validate up front so no bytes are consumed (and no negative length
        // is returned) when the arguments are bad.
        if (off < 0 || len < 0 || len > char_buffer.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", length=" + char_buffer.length);
        }

        for (int i = 0; i < len; ++i) {
            if (surrogate != 0) {
                // Second half of a surrogate pair left over from the previous entity.
                char_buffer[off + i] = surrogate;
                surrogate = 0;
                continue;
            }

            final int r = in.read();
            if (r < 0) {
                return i == 0 ? -1 : i;
            }

            if (r < 0x80) {
                // 0xxxxxxx: plain 7-bit ascii.
                char_buffer[off + i] = (char) r;
            } else if ((r & 0xC0) == 0x80) {
                // 10xxxxxx: This byte pattern should not be here.
                if (strict) {
                    throw new UnsupportedEncodingException(String.format(Locale.ENGLISH,
                                                                         "Unexpected utf-8 entity char: 0x%02x",
                                                                         r));
                }
                char_buffer[off + i] = '?';
            } else if ((r & 0xFE) == 0xFE) {
                // 0xFE and 0xFF: invalid utf-8 starting byte.
                if (strict) {
                    throw new UnsupportedEncodingException(String.format(Locale.ENGLISH,
                                                                         "Unexpected utf-8 non-entity char: 0x%02x",
                                                                         r));
                }
                char_buffer[off + i] = '?';
            } else {
                // Multi-byte entity: read all its bytes first, then validate
                // and decode them together.
                final int count = sequenceLength(r);
                buffer[0] = r;
                for (int c = 1; c < count; ++c) {
                    buffer[c] = in.read();
                }
                char_buffer[off + i] = convert(buffer, count);
            }
        }
        return len;
    }

    /**
     * Close the wrapped stream. After this call reads will fail.
     *
     * @throws IOException If closing the wrapped stream fails.
     */
    @Override
    public void close() throws IOException {
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            in = null;
        }
    }

    /**
     * Check if a read can return at least one char without blocking.
     *
     * @return True if the reader is open and either a surrogate is pending or
     *         the wrapped stream reports available bytes.
     * @throws IOException If the wrapped stream fails.
     */
    @Override
    public boolean ready() throws IOException {
        // A pending low surrogate is returned without touching the stream.
        return in != null && (surrogate != 0 || in.available() > 0);
    }

    /**
     * Get the total number of bytes in the entity started by the lead byte.
     *
     * @param lead Lead byte in the range 0xC0 to 0xFD.
     * @return The entity length, from 2 to 6.
     */
    private static int sequenceLength(final int lead) {
        if ((lead & 0xE0) == 0xC0) {
            // 110xxxxx + 1 * 10xxxxxx = 11 bit
            return 2;
        }
        if ((lead & 0xF0) == 0xE0) {
            // 1110xxxx + 2 * 10xxxxxx = 16 bit
            return 3;
        }
        if ((lead & 0xF8) == 0xF0) {
            // 11110xxx + 3 * 10xxxxxx = 21 bit
            return 4;
        }
        if ((lead & 0xFC) == 0xF8) {
            // 111110xx + 4 * 10xxxxxx = 26 bit
            return 5;
        }
        // 1111110x + 5 * 10xxxxxx = 31 bit
        return 6;
    }

    /**
     * Decode one multi-byte entity into a char. For code points outside the
     * basic multilingual plane the high surrogate is returned and the low
     * surrogate is stored for the next char position.
     *
     * @param arr The bytes of the entity, where -1 marks end of stream.
     * @param num The number of bytes in the entity.
     * @return The decoded char, the high surrogate, or '?' if malformed and
     *         not strict.
     * @throws IOException If the stream ended inside the entity.
     * @throws UnsupportedEncodingException If strict and the entity is malformed.
     */
    private char convert(final int[] arr, final int num) throws IOException {
        int cp;
        switch (num) {
            case 2:
                cp = (arr[0] & 0x1f);
                break;
            case 3:
                cp = (arr[0] & 0x0f);
                break;
            case 4:
                cp = (arr[0] & 0x07);
                break;
            case 5:
                cp = (arr[0] & 0x03);
                break;
            case 6:
                cp = (arr[0] & 0x01);
                break;
            default:
                // Should be impossible, but you never know.
                // See and fix read() method if this ever happens.
                throw new IOException("Unhandled utf-8 char length: " + num);
        }

        for (int i = 1; i < num; ++i) {
            if (arr[i] == -1) {
                throw new IOException("End of stream inside utf-8 encoded entity.");
            }
            if ((arr[i] & 0xC0) != 0x80) {
                if (strict) {
                    throw new UnsupportedEncodingException(String.format(Locale.ENGLISH,
                                                                         "Unexpected non-entity utf-8 char in entity extra bytes: 0x%02x",
                                                                         arr[i]));
                }
                return '?';
            }
            cp = (cp << 6) | (arr[i] & 0x3f);
        }

        // 4-byte entities can encode values above U+10FFFF, and 5- and 6-byte
        // entities always do. Those have no surrogate pair, so reject them
        // instead of producing garbage chars.
        if (!Character.isValidCodePoint(cp)) {
            if (strict) {
                throw new UnsupportedEncodingException(String.format(Locale.ENGLISH,
                                                                     "Invalid unicode code point in utf-8 entity: 0x%08x",
                                                                     cp));
            }
            return '?';
        }

        if (Character.isBmpCodePoint(cp)) {
            return (char) cp;
        }
        surrogate = Character.lowSurrogate(cp);
        return Character.highSurrogate(cp);
    }
}