// EscapeUtil.java

/*
 * Copyright (c) 2020, Stein Eldar Johnsen
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package net.morimekta.strings;

import static java.lang.Character.isHighSurrogate;
import static java.lang.Character.isLowSurrogate;
import static java.lang.Character.toCodePoint;
import static net.morimekta.strings.ConsoleUtil.isConsolePrintable;

/**
 * Utility for escaping and un-escaping strings Java style.
 *
 * <p>Escaping turns control characters, quotes, backslashes and characters
 * that are not console printable into escape sequences. Un-escaping reverses
 * that, and either rejects or replaces malformed input.
 */
public final class EscapeUtil {
    /**
     * The unicode replacement character, written to the output by the
     * non-strict un-escape for input that cannot be decoded. It is spelled
     * as a unicode escape so the value does not depend on the encoding the
     * source file is compiled with.
     */
    private static final char REPLACEMENT = '\uFFFD';

    /**
     * Properly java-escape the string for including in java strings or
     * printing with escape symbols to console.
     *
     * <p>A valid surrogate pair whose code point is console printable is
     * copied as is. Every other char is escaped on its own using
     * {@link #javaEscape(char)}.
     *
     * @param string The string to escape.
     * @return The escaped string.
     */
    public static String javaEscape(CharSequence string) {
        final int length = string.length();
        StringBuilder builder = new StringBuilder(length);
        for (int i = 0; i < length; ++i) {
            char c1 = string.charAt(i);
            if (isHighSurrogate(c1) && i + 1 < length) {
                char c2 = string.charAt(i + 1);
                if (isLowSurrogate(c2) && isConsolePrintable(toCodePoint(c1, c2))) {
                    // Keep the printable pair together and skip its second half.
                    builder.append(c1).append(c2);
                    ++i;
                    continue;
                }
            }
            builder.append(javaEscape(c1));
        }
        return builder.toString();
    }

    /**
     * Escape a single character. It is escaped into a string, as it may become
     * more than one char when escaped. The char is escaped in a way that can be
     * included in a java char or string literal.
     *
     * <p>Chars with a short java escape use that escape. Other chars below 32,
     * and 127, become a three digit octal escape. Chars that are not console
     * printable, and all surrogate chars, become a four digit hex unicode
     * escape. Anything else is returned unchanged.
     *
     * @param c The char to escape.
     * @return The escaped char string.
     */
    public static String javaEscape(char c) {
        switch (c) {
            case '\b':
                return "\\b";
            case '\t':
                return "\\t";
            case '\n':
                return "\\n";
            case '\f':
                return "\\f";
            case '\r':
                return "\\r";
            case '"':
                return "\\\"";
            case '\'':
                return "\\'";
            case '\\':
                return "\\\\";
            default:
                if (c < 32 || c == 127) {
                    return String.format("\\%03o", (int) c);
                } else if (!isConsolePrintable(c) ||
                           isHighSurrogate(c) ||
                           isLowSurrogate(c)) {
                    return String.format("\\u%04x", (int) c);
                }
                return String.valueOf(c);
        }
    }

    /**
     * UnEscape the char sequence using javas escape syntax used above and in
     * java strings. Invalid content is not rejected, see
     * {@link #javaUnEscape(CharSequence, boolean)} with strict set to false.
     *
     * @param str The string to un-escape.
     * @return The un-escaped string.
     */
    public static String javaUnEscape(CharSequence str) {
        return javaUnEscape(str, false);
    }

    /**
     * UnEscape the char sequence using javas escape syntax used above and in
     * java strings.
     *
     * <p>Handled escapes are the single char escapes for backspace, form feed,
     * newline, carriage return, tab, both quotes and backslash, a unicode
     * escape with exactly four hex digits, a lone zero escape, and three digit
     * octal escapes starting with 0 or 1. High and low surrogates, whether
     * given raw or as unicode escapes, must come in matching pairs.
     *
     * <p>When not strict, invalid escapes and unmatched surrogates are
     * replaced with the unicode replacement character. A lone backslash at
     * the very end of the input is dropped without error in both modes.
     *
     * @param str The string to un-escape.
     * @param strict If it should validate string content strictly.
     * @return The un-escaped string.
     * @throws IllegalArgumentException If strict and the string contains an
     *         invalid escape or an unmatched surrogate char.
     */
    public static String javaUnEscape(CharSequence str, boolean strict) {
        final int     l   = str.length();
        StringBuilder out = new StringBuilder(l);

        // Pending high surrogate waiting for its low surrogate, 0 if none.
        char    surrogate = 0;
        // True when the previous char was the backslash that starts an escape.
        boolean esc       = false;
        for (int i = 0; i < l; ++i) {
            char ch = str.charAt(i);
            if (surrogate > 0) {
                if (esc) {
                    if (ch == 'u') {
                        // A unicode escape directly after the high surrogate: it
                        // must decode to the matching low surrogate. In lenient
                        // mode a failure costs two replacement chars, one for
                        // the surrogate and one for the escape.
                        if (l < i + 5) {
                            if (strict) {
                                throw new IllegalArgumentException("Invalid escaped unicode char: '\\" +
                                                                   javaEscape(str.subSequence(i, l)) +
                                                                   "'");
                            }
                            out.append(REPLACEMENT).append(REPLACEMENT);
                        } else {
                            String n = str.subSequence(i + 1, i + 5).toString();
                            try {
                                ch = parseHexCodeUnit(n);
                                if (isLowSurrogate(ch)) {
                                    out.append(surrogate);
                                    out.append(ch);
                                } else if (strict) {
                                    throw unmatchedHighSurrogate(surrogate);
                                } else {
                                    out.append(REPLACEMENT).append(REPLACEMENT);
                                }
                            } catch (NumberFormatException e) {
                                if (strict) {
                                    throw new IllegalArgumentException("Invalid escaped unicode char: '\\u" +
                                                                       javaEscape(n) +
                                                                       "'", e);
                                }
                                out.append(REPLACEMENT).append(REPLACEMENT);
                            }
                        }
                        i += 4;  // skipping 4 more characters.
                        surrogate = 0;
                        // The escape is fully consumed. Without this reset the
                        // char after the escape would be read as an escape code.
                        esc = false;
                        continue;
                    } else {
                        // mismatch
                        if (strict) {
                            throw unmatchedHighSurrogate(surrogate);
                        }
                        out.append(REPLACEMENT);
                        surrogate = 0;
                        // and fall down to 'normal' handling.
                    }
                } else if (ch == '\\') {
                    // May be the start of an escaped low surrogate, keep the
                    // high surrogate pending until the escape is known.
                    esc = true;
                    continue;
                } else if (isLowSurrogate(ch)) {
                    out.append(surrogate);
                    out.append(ch);
                    surrogate = 0;
                    continue;
                } else {
                    if (strict) {
                        throw unmatchedHighSurrogate(surrogate);
                    }
                    out.append(REPLACEMENT);
                    surrogate = 0;
                    // and fall down to 'normal' handling.
                }
            }
            if (esc) {
                esc = false;
                switch (ch) {
                    case 'b':
                        out.append('\b');
                        break;
                    case 'f':
                        out.append('\f');
                        break;
                    case 'n':
                        out.append('\n');
                        break;
                    case 'r':
                        out.append('\r');
                        break;
                    case 't':
                        out.append('\t');
                        break;
                    case '\"':
                    case '\'':
                    case '\\':
                        out.append(ch);
                        break;
                    case 'u':
                        if (l < i + 5) {
                            if (strict) {
                                throw new IllegalArgumentException("Invalid escaped unicode char: '\\" +
                                                                   javaEscape(str.subSequence(i, l)) +
                                                                   "'");
                            }
                            out.append(REPLACEMENT);
                        } else {
                            String n = str.subSequence(i + 1, i + 5).toString();
                            try {
                                char cp = parseHexCodeUnit(n);
                                if (isHighSurrogate(cp)) {
                                    // Hold it back until the low surrogate shows up.
                                    surrogate = cp;
                                } else if (isLowSurrogate(cp)) {
                                    if (strict) {
                                        throw unmatchedLowSurrogate(cp);
                                    }
                                    out.append(REPLACEMENT);
                                } else {
                                    out.append(cp);
                                }
                            } catch (NumberFormatException e) {
                                if (strict) {
                                    throw new IllegalArgumentException("Invalid escaped unicode char: '\\u" +
                                                                       javaEscape(n) +
                                                                       "'", e);
                                }
                                out.append(REPLACEMENT);
                            }
                        }
                        i += 4;  // skipping 4 more characters.
                        break;
                    case '0':
                        if (l == i + 1 ||
                            (str.charAt(i + 1) < '0' || str.charAt(i + 1) > '9')) {
                            // allow single digit '\0' if the next char is not a digit.
                            out.append('\0');
                            break;
                        }
                        // Intentional fallthrough
                    case '1':
                        if (l < (i + 3)) {
                            if (strict) {
                                throw new IllegalArgumentException("Invalid escaped char: '\\" +
                                                                   javaEscape(str.subSequence(i, l)) +
                                                                   "'");
                            }
                            out.append(REPLACEMENT);
                        } else {
                            // The three octal digits include the current char.
                            String n = str.subSequence(i, i + 3).toString();
                            try {
                                int cp = Integer.parseInt(n, 8);
                                out.append((char) cp);
                            } catch (NumberFormatException e) {
                                if (strict) {
                                    throw new IllegalArgumentException("Invalid escaped char: '\\" +
                                                                       javaEscape(n) +
                                                                       "'", e);
                                }
                                out.append(REPLACEMENT);
                            }
                        }
                        i += 2;  // skipping 2 more characters.
                        break;
                    default:
                        if (strict) {
                            throw new IllegalArgumentException("Invalid escaped char: '\\" +
                                                               javaEscape(String.valueOf(ch)) +
                                                               "'");
                        }
                        out.append(REPLACEMENT);
                        break;
                }
            } else if (ch == '\\') {
                esc = true;
            } else if (isHighSurrogate(ch)) {
                surrogate = ch;
            } else if (isLowSurrogate(ch)) {
                // unmatched low surrogate
                if (strict) {
                    throw unmatchedLowSurrogate(ch);
                }
                out.append(REPLACEMENT);
            } else {
                out.append(ch);
            }
        }
        if (surrogate > 0) {
            // Input ended while still waiting for the low surrogate.
            if (strict) {
                throw unmatchedHighSurrogate(surrogate);
            }
            out.append(REPLACEMENT);
        }
        return out.toString();
    }

    /**
     * Parse the digits of a unicode escape into the char it denotes. Unlike
     * {@link Integer#parseInt(String, int)} this does not accept a leading
     * sign, so only actual hex digits are valid.
     *
     * @param digits The four chars following the 'u' of the escape.
     * @return The decoded char.
     * @throws NumberFormatException If any of the chars is not a hex digit.
     */
    private static char parseHexCodeUnit(String digits) {
        int value = 0;
        for (int i = 0; i < digits.length(); ++i) {
            int digit = Character.digit(digits.charAt(i), 16);
            if (digit < 0) {
                throw new NumberFormatException("Not a hex digit: '" + digits.charAt(i) + "'");
            }
            value = (value << 4) | digit;
        }
        return (char) value;
    }

    /** Make the exception used when a high surrogate has no matching low surrogate. */
    private static IllegalArgumentException unmatchedHighSurrogate(char surrogate) {
        return new IllegalArgumentException(String.format(
                "Unmatched high surrogate char: '\\u%04x'",
                (int) surrogate));
    }

    /** Make the exception used when a low surrogate has no preceding high surrogate. */
    private static IllegalArgumentException unmatchedLowSurrogate(char surrogate) {
        return new IllegalArgumentException(String.format(
                "Unmatched low surrogate char: '\\u%04x'",
                (int) surrogate));
    }

    private EscapeUtil() {}
}