Added #116 Show invalid utf-8 bytes in Strings as {invalid_utf8:xxx}

This commit is contained in:
Jindra Petřík
2023-10-08 16:53:57 +02:00
parent 98b7ac100e
commit b6e8ca0d67
7 changed files with 162 additions and 7 deletions

View File

@@ -508,7 +508,10 @@ public class SWFInputStream implements AutoCloseable {
r = readEx();
if (r == 0) {
endDumpLevel();
return new String(baos.toByteArray(), swf == null ? Utf8Helper.charsetName : swf.getCharset());
if (swf == null || "UTF-8".equals(swf.getCharset())) {
return Utf8Helper.decode(baos.toByteArray());
}
return new String(baos.toByteArray(), swf.getCharset());
}
baos.write(r);
}

View File

@@ -80,6 +80,7 @@ import com.jpexs.decompiler.flash.types.shaperecords.SHAPERECORD;
import com.jpexs.decompiler.flash.types.shaperecords.StraightEdgeRecord;
import com.jpexs.decompiler.flash.types.shaperecords.StyleChangeRecord;
import com.jpexs.helpers.ByteArrayRange;
import com.jpexs.helpers.utf8.Utf8Helper;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
@@ -179,7 +180,12 @@ public class SWFOutputStream extends OutputStream {
* @throws IOException
*/
public void writeString(String value) throws IOException {
byte[] data = value.getBytes(charset);
byte[] data;
if ("UTF-8".equals(charset)) {
data = Utf8Helper.getBytes(value);
} else {
data = value.getBytes(charset);
}
for (int i = 0; i < data.length; i++) {
if (data[i] == 0) {
throw new IOException("String should not contain null character.");

View File

@@ -34,6 +34,7 @@ import com.jpexs.decompiler.flash.abc.types.traits.Traits;
import com.jpexs.decompiler.flash.dumpview.DumpInfo;
import com.jpexs.decompiler.flash.dumpview.DumpInfoSpecial;
import com.jpexs.decompiler.flash.dumpview.DumpInfoSpecialType;
import com.jpexs.helpers.Helper;
import com.jpexs.helpers.MemoryInputStream;
import com.jpexs.helpers.utf8.Utf8Helper;
import java.io.ByteArrayOutputStream;
@@ -523,8 +524,8 @@ public class ABCInputStream implements AutoCloseable {
stringDataBuffer = new byte[newLength];
}
safeRead(length, stringDataBuffer);
String r = new String(stringDataBuffer, 0, length, Utf8Helper.charset);
safeRead(length, stringDataBuffer);
String r = Utf8Helper.decode(stringDataBuffer, 0, length);
endDumpLevel(r);
return r;
}

View File

@@ -1757,4 +1757,5 @@ public class Helper {
}
return Integer.parseInt(version);
}
}

View File

@@ -18,6 +18,8 @@ package com.jpexs.helpers.utf8;
import com.jpexs.helpers.utf8.charset.Gb2312;
import com.jpexs.helpers.utf8.charset.ShiftJis;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
@@ -29,6 +31,8 @@ import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
@@ -98,13 +102,106 @@ public class Utf8Helper {
}
}
public static byte[] getBytes(String string) {
return string.getBytes(charset);
/**
 * Encodes a string as UTF-8, turning {@code {invalid_utf8:N}} markers back
 * into the single raw byte {@code N}.
 *
 * <p>A marker is only honoured when {@code N} is a run of ASCII digits whose
 * value is in the range 0-255 and it is terminated by {@code '}'}. Anything
 * else (no digits, missing brace, value that does not fit in a byte) is
 * encoded as ordinary text instead of throwing or being truncated.
 *
 * <p>Text between markers is encoded as whole segments, so surrogate pairs
 * (characters outside the BMP) are emitted as proper 4-byte sequences.
 *
 * @param string text to encode, possibly containing markers
 * @return encoded bytes
 */
public static byte[] getBytes(String string) {
    final String prefix = "{invalid_utf8:";
    int markerStart = string.indexOf(prefix);
    if (markerStart < 0) {
        // Fast path: nothing to unescape.
        return string.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    }

    final int total = string.length();
    ByteArrayOutputStream out = new ByteArrayOutputStream(total);
    int literalStart = 0;
    while (markerStart >= 0) {
        int pos = markerStart + prefix.length();
        int digits = 0;
        int value = 0;
        while (pos < total && string.charAt(pos) >= '0' && string.charAt(pos) <= '9') {
            // Saturate at 256 so arbitrarily long digit runs cannot overflow.
            value = Math.min(value * 10 + (string.charAt(pos) - '0'), 256);
            digits++;
            pos++;
        }
        boolean isMarker = digits > 0 && value <= 0xff
                && pos < total && string.charAt(pos) == '}';
        if (isMarker) {
            writeUtf8Segment(out, string, literalStart, markerStart);
            out.write(value);
            literalStart = pos + 1;
            markerStart = string.indexOf(prefix, literalStart);
        } else {
            // Not a real marker: leave it in the literal text and keep looking.
            markerStart = string.indexOf(prefix, markerStart + 1);
        }
    }
    writeUtf8Segment(out, string, literalStart, total);
    return out.toByteArray();
}

/**
 * Appends the UTF-8 encoding of {@code string[from, to)} to {@code out}.
 */
private static void writeUtf8Segment(ByteArrayOutputStream out, String string, int from, int to) {
    if (to > from) {
        byte[] encoded = string.substring(from, to).getBytes(java.nio.charset.StandardCharsets.UTF_8);
        out.write(encoded, 0, encoded.length);
    }
}
/**
 * Returns the number of bytes {@link #getBytes(String)} produces for the
 * given string, i.e. with {@code {invalid_utf8:N}} markers counted as the
 * single byte they stand for.
 *
 * @param string text to measure
 * @return encoded length in bytes
 */
public static int getBytesLength(String string) {
    // todo: make it faster without actually writing it to an array
    return getBytes(string).length;
}
/**
 * Decodes an entire byte array by delegating to
 * {@link #decode(byte[], int, int)} with offset 0 and the array's full length.
 *
 * @param data bytes to decode
 * @return decoded text
 */
public static String decode(byte[] data) {
    final int offset = 0;
    final int count = data.length;
    return decode(data, offset, count);
}
/**
 * Formats one undecodable byte as the textual marker that
 * {@link #getBytes(String)} converts back into that byte.
 *
 * @param v unsigned byte value
 * @return marker of the form {@code {invalid_utf8:N}} with N in decimal
 */
private static String escapeInvalidUtf8Char(int v) {
    return "{invalid_utf8:" + v + "}";
}

/**
 * Decodes {@code length} bytes of {@code data}, beginning at {@code start},
 * as UTF-8. Every byte that is not part of a well-formed UTF-8 sequence is
 * rendered as a {@code {invalid_utf8:N}} marker instead of being replaced by
 * U+FFFD, so the original bytes can be reconstructed by
 * {@link #getBytes(String)}.
 *
 * <p>Well-formedness is checked strictly: overlong encodings, encoded
 * surrogates and values above U+10FFFF are treated as invalid. Only the
 * offending byte is escaped; decoding resumes at the very next byte, so valid
 * characters following a broken lead byte are preserved.
 *
 * @param data source bytes
 * @param start index of the first byte to decode
 * @param length number of bytes to decode; values &lt;= 0 yield an empty string
 * @return decoded text with invalid bytes escaped
 * @throws IndexOutOfBoundsException if the requested range is outside {@code data}
 */
public static String decode(byte[] data, int start, int length) {
    if (length <= 0) {
        return "";
    }
    final java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
    final int end = start + length;
    StringBuilder result = null; // created lazily; stays null for fully valid input
    int runStart = start;        // first byte of the current run of valid bytes
    int pos = start;
    while (pos < end) {
        int sequenceLength = wellFormedSequenceLength(data, pos, end);
        if (sequenceLength > 0) {
            pos += sequenceLength;
            continue;
        }
        if (result == null) {
            result = new StringBuilder(length + 16);
        }
        // Flush the valid run preceding the bad byte, then escape that byte alone.
        result.append(new String(data, runStart, pos - runStart, utf8));
        result.append(escapeInvalidUtf8Char(data[pos] & 0xff));
        pos++;
        runStart = pos;
    }
    if (result == null) {
        return new String(data, start, length, utf8);
    }
    result.append(new String(data, runStart, end - runStart, utf8));
    return result.toString();
}

/**
 * Returns the length (1-4) of the well-formed UTF-8 sequence starting at
 * {@code pos}, or 0 when the byte at {@code pos} does not start one that
 * fits completely before {@code end}.
 */
private static int wellFormedSequenceLength(byte[] data, int pos, int end) {
    int lead = data[pos] & 0xff;
    if (lead < 0x80) {
        return 1;
    }
    if (lead < 0xC2 || lead > 0xF4) {
        // Stray continuation byte, overlong 2-byte lead (C0/C1) or lead beyond U+10FFFF.
        return 0;
    }
    int sequenceLength;
    int secondMin = 0x80;
    int secondMax = 0xBF;
    if (lead < 0xE0) {
        sequenceLength = 2;
    } else if (lead < 0xF0) {
        sequenceLength = 3;
        if (lead == 0xE0) {
            secondMin = 0xA0; // rejects overlong 3-byte forms
        } else if (lead == 0xED) {
            secondMax = 0x9F; // rejects encoded surrogates
        }
    } else {
        sequenceLength = 4;
        if (lead == 0xF0) {
            secondMin = 0x90; // rejects overlong 4-byte forms
        } else if (lead == 0xF4) {
            secondMax = 0x8F; // rejects values above U+10FFFF
        }
    }
    if (pos + sequenceLength > end) {
        return 0;
    }
    int second = data[pos + 1] & 0xff;
    if (second < secondMin || second > secondMax) {
        return 0;
    }
    for (int k = 2; k < sequenceLength; k++) {
        if ((data[pos + k] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return sequenceLength;
}
public static char codePointToChar(int codePoint, String charsetName) {