EscapeUtil.java
/*
* Copyright (c) 2020, Stein Eldar Johnsen
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package net.morimekta.strings;
import static java.lang.Character.isHighSurrogate;
import static java.lang.Character.isLowSurrogate;
import static java.lang.Character.toCodePoint;
import static net.morimekta.strings.ConsoleUtil.isConsolePrintable;
/**
 * Utility for escaping and un-escaping strings Java style.
 * <p>
 * Escaping produces the short escapes ({@code \b \t \n \f \r \" \' \\}),
 * three digit octal escapes for the remaining chars below 32 and for 127, and
 * {@code \}{@code uXXXX} escapes for chars that are not console printable or are
 * unpaired surrogates. Un-escaping understands the same syntax.
 */
public final class EscapeUtil {
    /** Emitted in lenient mode in place of each char or escape that cannot be decoded. */
    private static final char REPLACEMENT = '�';

    /**
     * Properly java-escape the string for including in java strings or
     * printing with escape symbols to console.
     * <p>
     * A high surrogate directly followed by a low surrogate is copied
     * unchanged when the combined code point is console printable. Every
     * other char is escaped on its own with {@link #javaEscape(char)}.
     *
     * @param string The string to escape.
     * @return The escaped string.
     */
    public static String javaEscape(CharSequence string) {
        final int length = string.length();
        StringBuilder builder = new StringBuilder(length);
        for (int i = 0; i < length; ++i) {
            char c1 = string.charAt(i);
            if (isHighSurrogate(c1) && i + 1 < length) {
                char c2 = string.charAt(i + 1);
                if (isLowSurrogate(c2) && isConsolePrintable(toCodePoint(c1, c2))) {
                    builder.append(c1).append(c2);
                    ++i; // the low surrogate has been consumed too.
                    continue;
                }
            }
            builder.append(javaEscape(c1));
        }
        return builder.toString();
    }

    /**
     * Escape a single character. It is escaped into a string, as it may become
     * more than one char when escaped. The char is escaped in a way that can be
     * included in a java char.
     *
     * @param c The char to escape.
     * @return The escaped char string.
     */
    public static String javaEscape(char c) {
        switch (c) {
            case '\b':
                return "\\b";
            case '\t':
                return "\\t";
            case '\n':
                return "\\n";
            case '\f':
                return "\\f";
            case '\r':
                return "\\r";
            case '"':
                return "\\\"";
            case '\'':
                return "\\'";
            case '\\':
                return "\\\\";
            default:
                if (c < 32 || c == 127) {
                    // Remaining ASCII control chars: 3 digit octal escape.
                    return String.format("\\%03o", (int) c);
                }
                if (!isConsolePrintable(c) || isHighSurrogate(c) || isLowSurrogate(c)) {
                    // A lone surrogate is always escaped, it has no meaning on its own.
                    return String.format("\\u%04x", (int) c);
                }
                return String.valueOf(c);
        }
    }

    /**
     * UnEscape the char sequence using javas escape syntax used above and in
     * java strings. This is the lenient variant, equivalent to
     * {@code javaUnEscape(str, false)}.
     *
     * @param str The string to un-escape.
     * @return The un-escaped string.
     */
    public static String javaUnEscape(CharSequence str) {
        return javaUnEscape(str, false);
    }

    /**
     * UnEscape the char sequence using javas escape syntax used above and in
     * java strings.
     * <p>
     * Handled escapes are {@code \b \f \n \r \t \" \' \\}, a unicode escape
     * with exactly four hex digits, a three digit octal escape starting with
     * {@code 0} or {@code 1}, and a single {@code \0} when it is not followed
     * by a digit. Surrogates, whether literal or escaped, must come as a high
     * surrogate directly followed by a low surrogate.
     * <p>
     * In lenient mode each undecodable escape or unmatched surrogate is
     * replaced with a replacement char. In strict mode it causes an exception
     * instead. A single backslash at the very end of the input is dropped in
     * both modes.
     *
     * @param str    The string to un-escape.
     * @param strict If it should validate string content strictly.
     * @return The un-escaped string.
     * @throws IllegalArgumentException In strict mode, if the content has an
     *                                  invalid escape or an unmatched surrogate.
     */
    public static String javaUnEscape(CharSequence str, boolean strict) {
        final int l = str.length();
        StringBuilder out = new StringBuilder(l);
        char surrogate = 0;  // pending high surrogate, 0 when there is none.
        boolean esc = false; // true when the previous char was an unconsumed backslash.
        for (int i = 0; i < l; ++i) {
            char ch = str.charAt(i);
            if (surrogate != 0) {
                if (!esc) {
                    if (ch == '\\') {
                        // May be the start of an escaped low surrogate.
                        esc = true;
                        continue;
                    }
                    if (isLowSurrogate(ch)) {
                        out.append(surrogate).append(ch);
                        surrogate = 0;
                        continue;
                    }
                } else if (ch == 'u') {
                    int unit = parseHex4(str, i + 1);
                    if (unit >= 0 && isLowSurrogate((char) unit)) {
                        out.append(surrogate).append((char) unit);
                        surrogate = 0;
                        // The escape is fully consumed. Without this reset the
                        // char after the pair would be read as an escape code.
                        esc = false;
                        i += 4; // skipping 4 more characters.
                        continue;
                    }
                    if (unit < 0 && strict) {
                        // Malformed escape takes precedence over the surrogate mismatch.
                        throw invalidUnicodeEscape(str, i);
                    }
                }
                // The pending high surrogate was not followed by a low surrogate.
                if (strict) {
                    throw unmatchedSurrogate("high", surrogate);
                }
                out.append(REPLACEMENT);
                surrogate = 0;
                // and fall down to 'normal' handling of the current char, so
                // whatever follows the bad surrogate is decoded and not lost.
            }
            if (esc) {
                esc = false;
                switch (ch) {
                    case 'b':
                        out.append('\b');
                        break;
                    case 'f':
                        out.append('\f');
                        break;
                    case 'n':
                        out.append('\n');
                        break;
                    case 'r':
                        out.append('\r');
                        break;
                    case 't':
                        out.append('\t');
                        break;
                    case '\"':
                    case '\'':
                    case '\\':
                        out.append(ch);
                        break;
                    case 'u': {
                        int unit = parseHex4(str, i + 1);
                        if (unit < 0) {
                            if (strict) {
                                throw invalidUnicodeEscape(str, i);
                            }
                            out.append(REPLACEMENT);
                        } else {
                            char cp = (char) unit;
                            if (isHighSurrogate(cp)) {
                                // Wait for the matching low surrogate.
                                surrogate = cp;
                            } else if (isLowSurrogate(cp)) {
                                if (strict) {
                                    throw unmatchedSurrogate("low", cp);
                                }
                                out.append(REPLACEMENT);
                            } else {
                                out.append(cp);
                            }
                        }
                        i += 4; // skipping 4 more characters.
                        break;
                    }
                    case '0':
                        if (i + 1 == l || str.charAt(i + 1) < '0' || str.charAt(i + 1) > '9') {
                            // allow single digit '\0' if the next char is not a digit.
                            out.append('\0');
                            break;
                        }
                        // Intentional fallthrough
                    case '1':
                        if (l < i + 3) {
                            if (strict) {
                                throw new IllegalArgumentException("Invalid escaped char: '\\" +
                                                                   javaEscape(str.subSequence(i, l)) +
                                                                   "'");
                            }
                            out.append(REPLACEMENT);
                        } else {
                            // The current digit is the first of the three octal digits.
                            String n = str.subSequence(i, i + 3).toString();
                            try {
                                out.append((char) Integer.parseInt(n, 8));
                            } catch (NumberFormatException e) {
                                if (strict) {
                                    throw new IllegalArgumentException("Invalid escaped char: '\\" +
                                                                       javaEscape(n) +
                                                                       "'", e);
                                }
                                out.append(REPLACEMENT);
                            }
                        }
                        i += 2; // skipping 2 more characters.
                        break;
                    default:
                        if (strict) {
                            throw new IllegalArgumentException("Invalid escaped char: '\\" +
                                                               javaEscape(String.valueOf(ch)) +
                                                               "'");
                        }
                        out.append(REPLACEMENT);
                        break;
                }
            } else if (ch == '\\') {
                esc = true;
            } else if (isHighSurrogate(ch)) {
                surrogate = ch;
            } else if (isLowSurrogate(ch)) {
                // unmatched low surrogate
                if (strict) {
                    throw unmatchedSurrogate("low", ch);
                }
                out.append(REPLACEMENT);
            } else {
                out.append(ch);
            }
        }
        if (surrogate != 0) {
            // Input ended while still waiting for a low surrogate.
            if (strict) {
                throw unmatchedSurrogate("high", surrogate);
            }
            out.append(REPLACEMENT);
        }
        return out.toString();
    }

    /**
     * Parse exactly four hex digits starting at the given index. Unlike
     * {@link Integer#parseInt(String, int)} a leading sign is not accepted.
     *
     * @param str  The char sequence to read from.
     * @param from Index of the first hex digit.
     * @return The parsed value, or -1 if there are fewer than four chars left
     *         or any of them is not a hex digit.
     */
    private static int parseHex4(CharSequence str, int from) {
        if (str.length() < from + 4) {
            return -1;
        }
        int value = 0;
        for (int k = from; k < from + 4; ++k) {
            int digit = Character.digit(str.charAt(k), 16);
            if (digit < 0) {
                return -1;
            }
            value = (value << 4) | digit;
        }
        return value;
    }

    /**
     * Build the exception for a malformed unicode escape.
     *
     * @param str    The char sequence being un-escaped.
     * @param uIndex Index of the 'u' following the backslash.
     * @return The exception to throw, quoting the escape (or what is left of it).
     */
    private static IllegalArgumentException invalidUnicodeEscape(CharSequence str, int uIndex) {
        int end = Math.min(str.length(), uIndex + 5);
        return new IllegalArgumentException("Invalid escaped unicode char: '\\" +
                                            javaEscape(str.subSequence(uIndex, end)) +
                                            "'");
    }

    /**
     * Build the exception for a surrogate without its counterpart.
     *
     * @param kind Either "high" or "low".
     * @param unit The unmatched surrogate char.
     * @return The exception to throw.
     */
    private static IllegalArgumentException unmatchedSurrogate(String kind, char unit) {
        return new IllegalArgumentException(String.format(
                "Unmatched %s surrogate char: '\\u%04x'", kind, (int) unit));
    }

    private EscapeUtil() {}
}