// Utf8StreamWriter.java
/*
* Copyright (c) 2017, Stein Eldar Johnsen
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package net.morimekta.strings.io;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import static java.lang.Character.isHighSurrogate;
import static java.lang.Character.isLowSurrogate;
import static java.lang.Character.toCodePoint;
/**
* Similar to the native Java {@link java.io.OutputStreamWriter}, but locked to
* UTF-8 and explicitly without any buffering, with one exception only: a high
* surrogate char is held back until the following char is seen. As a result of
* catching surrogate pairs, it writes the combined unicode code point to the stream
* as one UTF-8 sequence instead of encoding each surrogate of the pair separately.
* <p>
* To make this writer more efficient, wrap the output stream in
* a BufferedOutputStream, which can handle all the byte-level buffering.
* E.g.:
* </p>
* <pre>
* Writer writer = new Utf8StreamWriter(new BufferedOutputStream(out));
* </pre>
*/
public class Utf8StreamWriter extends Writer {
    /** Scratch space for the bytes of one UTF-8 sequence, stored least significant byte first. */
    private final int[] buffer;
    /** If true, malformed surrogate usage throws; if false it is replaced with {@code '?'}. */
    private final boolean strict;
    /** The stream written to; set to {@code null} once this writer has been closed. */
    private OutputStream out;
    /** The high surrogate waiting for its low surrogate, or 0 when none is pending. */
    private char surrogate;

    /**
     * Make a stream writer with strict unicode surrogate pair matching.
     *
     * @param out Stream to write to.
     */
    public Utf8StreamWriter(OutputStream out) {
        this(out, true);
    }

    /**
     * Make a stream writer.
     *
     * @param out Stream to write to.
     * @param strict If unicode surrogate pairs should be matched strictly.
     *               When false, each unmatched surrogate is written as a
     *               single {@code '?'} instead of causing an exception.
     */
    public Utf8StreamWriter(OutputStream out, boolean strict) {
        this.out = out;
        this.buffer = new int[6];
        this.surrogate = 0;
        this.strict = strict;
    }

    /**
     * Encode the given chars as UTF-8 and write them directly to the stream.
     * A high surrogate is held back until the next char is seen, which may
     * happen in a later call, so that the pair is written as one code point.
     *
     * @param chars Array containing the chars to write.
     * @param off Index of the first char to write.
     * @param len Number of chars to write.
     * @throws UnsupportedEncodingException In strict mode, if a surrogate is
     *         not part of a well-formed high + low pair.
     * @throws IOException If the writer is closed or the stream fails.
     */
    @Override
    public void write(char[] chars, int off, int len) throws IOException {
        if (out == null) {
            throw new IOException("Writing to a closed stream.");
        }
        for (int i = 0; i < len; ++i) {
            final char c = chars[off + i];
            if (isHighSurrogate(c)) {
                if (surrogate != 0) {
                    if (strict) {
                        throw new UnsupportedEncodingException("High surrogate " + Integer.toHexString(c) +
                                                               " after high: " +
                                                               Integer.toHexString(surrogate));
                    }
                    out.write('?');  // for the bad high surrogate
                }
                // The new high surrogate replaces any previous one as the pending char.
                surrogate = c;
            } else if (isLowSurrogate(c)) {
                if (surrogate != 0) {
                    writeCodePoint(toCodePoint(surrogate, c));
                    surrogate = 0;
                } else {
                    if (strict) {
                        throw new UnsupportedEncodingException("Missing high surrogate before low: " +
                                                               Integer.toHexString(c));
                    }
                    out.write('?');
                }
            } else {
                if (surrogate != 0) {
                    if (strict) {
                        throw new UnsupportedEncodingException("Missing low surrogate after high: " +
                                                               Integer.toHexString(surrogate));
                    }
                    out.write('?');  // for the bad high surrogate
                    // The bad surrogate is now accounted for. Without this reset every
                    // following char would get its own '?' and a later low surrogate
                    // would be paired with this stale high surrogate.
                    surrogate = 0;
                }
                writeCodePoint(c);
            }
        }
    }

    /**
     * Write a single code point to the stream as a UTF-8 byte sequence.
     *
     * @param cp The code point to write.
     * @throws IOException If the stream fails, or the code point can not be
     *         encoded in 2 to 6 bytes (e.g. it is negative).
     */
    private void writeCodePoint(int cp) throws IOException {
        final int original = cp;
        if (cp < 0x80) {
            // ASCII
            out.write((byte) cp);
        } else {
            // Multi-byte sequence. Peel off 6 bits at a time as continuation
            // bytes (10xxxxxx) until the remainder fits in the lead byte. The
            // lead byte has one bit less of payload for each continuation byte,
            // hence the limit is halved on every round.
            int c = 0;
            int lastOverLimit = 0x40;
            while (cp >= lastOverLimit) {
                buffer[c++] = (cp & 0x3f) | 0x80;
                cp >>>= 6;
                lastOverLimit >>>= 1;
            }
            // The lead byte marks the sequence length with its high bits.
            switch (c) {
                case 1: buffer[c] = 0xC0 | cp; break;
                case 2: buffer[c] = 0xE0 | cp; break;
                case 3: buffer[c] = 0xF0 | cp; break;
                case 4: buffer[c] = 0xF8 | cp; break;
                case 5: buffer[c] = 0xFC | cp; break;
                // Should be impossible, fix write() methods if it isn't
                default: throw new IOException("Unreachable code reached: " + Integer.toHexString(original));
            }
            // The bytes were collected least significant first, so write them
            // in reverse order: lead byte, then continuation bytes.
            for (int i = c; i >= 0; --i) {
                out.write(buffer[i]);
            }
        }
    }

    /**
     * Flush the underlying stream. Does nothing if the writer is closed.
     *
     * @throws IOException In strict mode, if a high surrogate is still waiting
     *         for its low surrogate, or if flushing the stream fails.
     */
    @Override
    public void flush() throws IOException {
        if (out != null) {
            if (strict && surrogate != 0) {
                throw new IOException("Surrogate high pair written, but no low");
            }
            out.flush();
        }
    }

    /**
     * Flush and close the underlying stream. Closing an already closed writer
     * has no effect. The underlying stream is closed even when flushing fails,
     * in which case the flush failure is the exception thrown.
     * <p>
     * In non-strict mode a high surrogate still waiting for its low surrogate
     * is written as {@code '?'}, like any other unmatched surrogate.
     *
     * @throws IOException In strict mode, if a high surrogate is still waiting
     *         for its low surrogate, or if flushing or closing the stream fails.
     */
    @Override
    public void close() throws IOException {
        if (out == null) {
            return;
        }
        // try-with-resources guarantees the stream is closed even if the flush
        // throws; previously a failing flush leaked the underlying stream.
        try (OutputStream closing = out) {
            if (!strict && surrogate != 0) {
                // No more chars can arrive, so the pending surrogate is unmatched.
                closing.write('?');
                surrogate = 0;
            }
            flush();
        } finally {
            out = null;
        }
    }
}