Skip to content

Commit

Permalink
Port convert_UTF8_to_JSON from C
Browse files Browse the repository at this point in the history
Also includes updated logic for generate (generate_json_string)
based on current C code.

Original code by @byroot

See ruby#620
  • Loading branch information
headius committed Jan 9, 2025
1 parent 4d37e9f commit 38c7831
Showing 1 changed file with 95 additions and 11 deletions.
106 changes: 95 additions & 11 deletions java/src/json/ext/StringEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

/**
* An encoder that reads from the given source and outputs its representation
Expand Down Expand Up @@ -46,6 +47,15 @@ final class StringEncoder extends ByteListTranscoder {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

static final byte[] ASCII_ONLY_ESCAPE_TABLE = {
Expand Down Expand Up @@ -97,6 +107,8 @@ final class StringEncoder extends ByteListTranscoder {
//First byte of a 4+ byte code point
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};
// Pre-encoded ASCII bytes of the six-character JSON escapes (backslash, 'u', four hex digits)
// for the code points U+2028 and U+2029. Each array is 6 bytes long.
private static final byte[] BACKSLASH_U2028 = "\\u2028".getBytes(StandardCharsets.US_ASCII);
private static final byte[] BACKSLASH_U2029 = "\\u2029".getBytes(StandardCharsets.US_ASCII);

// Encoder options, as used by the CR_VALID branch of generate():
//  - asciiOnly:  when true the string is processed by encodeASCII, otherwise by encode.
//  - scriptSafe: when true SCRIPT_SAFE_ESCAPE_TABLE is passed as the escape table,
//                otherwise ASCII_ONLY_ESCAPE_TABLE (encodeASCII) or ESCAPE_TABLE (encode).
private final boolean asciiOnly, scriptSafe;

Expand Down Expand Up @@ -143,10 +155,12 @@ void generate(ThreadContext context, RubyString object, OutputStream buffer) thr
append('"');
switch (object.scanForCodeRange()) {
case StringSupport.CR_7BIT:
encodeASCII(context, byteList, buffer);
break;
case StringSupport.CR_VALID:
encode(context, byteList, buffer);
if (asciiOnly) {
encodeASCII(byteList, scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ASCII_ONLY_ESCAPE_TABLE);
} else {
encode(byteList, scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ESCAPE_TABLE);
}
break;
default:
throw Utils.buildGeneratorError(context, object, "source sequence is illegal/malformed utf-8").toThrowable();
Expand Down Expand Up @@ -178,15 +192,85 @@ static RubyString ensureValidEncoding(ThreadContext context, RubyString str) {
return str;
}

void encode(ThreadContext context, ByteList src, OutputStream out) throws IOException {
while (hasNext()) {
handleChar(readUtf8Char(context));
/**
 * Copies the bytes of {@code src} to the output, substituting a JSON escape
 * for every byte that {@code escape_table} marks as needing one.
 * Java port of the C function {@code convert_UTF8_to_JSON}.
 *
 * <p>The table is indexed by the unsigned value of each byte. The entry is
 * interpreted as follows:
 * <ul>
 *   <li>zero or negative: the byte is left in the pending run, advance by one;</li>
 *   <li>{@code 9}: a single byte that is replaced by an escape sequence;</li>
 *   <li>{@code 11}: first byte of a three-byte sequence that is escaped only when
 *       the next two bytes are {@code 0x80 0xA8} (U+2028) or {@code 0x80 0xA9}
 *       (U+2029); otherwise the three bytes stay in the pending run;</li>
 *   <li>any other positive value: that many bytes stay in the pending run.</li>
 * </ul>
 *
 * <p>Unescaped bytes are not written one at a time; they accumulate as a run
 * that is written just before the next escape and once more at the end.
 *
 * @param src          the bytes to encode; no validation is performed in this method,
 *                     so continuation bytes are read without bounds checks
 *                     (NOTE(review): callers presumably guarantee valid UTF-8 - confirm)
 * @param escape_table lookup table indexed by unsigned byte value (0-255)
 * @throws IOException declared by the method; presumably propagated from the
 *                     output helpers, whose definitions are not visible here
 */
void encode(ByteList src, byte[] escape_table) throws IOException {
    final byte[] hex = HEX;
    // NOTE(review): only indexes 2..5 are written below while 6 bytes are emitted,
    // so aux presumably already holds a backslash and 'u' at indexes 0 and 1 - confirm.
    final byte[] escapeBuf = aux;

    final byte[] bytes = src.unsafeBytes();
    final int base = src.begin();
    final int length = src.realSize();

    int runStart = 0; // offset of the first byte not yet written
    int cursor = 0;   // offset of the byte being examined

    while (cursor < length) {
        final int octet = bytes[base + cursor] & 0xFF;
        final int action = escape_table[octet];

        // Nothing special about this byte: keep it in the pending run.
        if (action <= 0) {
            cursor++;
            continue;
        }

        // Single byte that must be escaped.
        if (action == 9) {
            runStart = cursor = flushPos(cursor, runStart, bytes, base, 1);
            switch (octet) {
                case '"':
                    appendEscape(BACKSLASH_DOUBLEQUOTE);
                    break;
                case '\\':
                    appendEscape(BACKSLASH_BACKSLASH);
                    break;
                case '/':
                    appendEscape(BACKSLASH_FORWARDSLASH);
                    break;
                case '\b':
                    appendEscape(BACKSLASH_B);
                    break;
                case '\f':
                    appendEscape(BACKSLASH_F);
                    break;
                case '\n':
                    appendEscape(BACKSLASH_N);
                    break;
                case '\r':
                    appendEscape(BACKSLASH_R);
                    break;
                case '\t':
                    appendEscape(BACKSLASH_T);
                    break;
                default:
                    // No short form: emit the generic six-byte form with two
                    // leading zero digits followed by the byte value in hex.
                    escapeBuf[2] = '0';
                    escapeBuf[3] = '0';
                    escapeBuf[4] = hex[(octet >> 4) & 0xf];
                    escapeBuf[5] = hex[octet & 0xf];
                    append(escapeBuf, 0, 6);
                    break;
            }
            continue;
        }

        // Possible start of U+2028 / U+2029; the third byte is only read
        // when the second byte is 0x80.
        if (action == 11) {
            byte[] separatorEscape = null;
            final int second = bytes[base + cursor + 1] & 0xFF;
            if (second == 0x80) {
                final int third = bytes[base + cursor + 2] & 0xFF;
                if (third == 0xA8) {
                    separatorEscape = BACKSLASH_U2028;
                } else if (third == 0xA9) {
                    separatorEscape = BACKSLASH_U2029;
                }
            }

            if (separatorEscape == null) {
                // Some other three-byte sequence: leave it in the pending run.
                cursor += 3;
            } else {
                runStart = cursor = flushPos(cursor, runStart, bytes, base, 3);
                append(separatorEscape, 0, 6);
            }
            continue;
        }

        // Any other positive entry is a byte count to step over unchanged.
        cursor += action;
    }

    // Write whatever remains after the last escape.
    if (runStart < length) {
        append(bytes, base + runStart, length - runStart);
    }
}

/**
 * Writes the pending run of bytes {@code [beg, pos)} (offsets relative to
 * {@code ptr}) when it is non-empty, and returns the offset at which
 * scanning should resume.
 *
 * @param pos      offset of the byte currently being handled
 * @param beg      offset of the first byte not yet written
 * @param ptrBytes backing array that holds the source bytes
 * @param ptr      index in {@code ptrBytes} that corresponds to offset 0
 * @param size     number of source bytes consumed by the caller at {@code pos}
 * @return {@code pos + size}
 * @throws IOException declared by the method; presumably propagated from
 *                     {@code append}, whose definition is not visible here
 */
private int flushPos(int pos, int beg, byte[] ptrBytes, int ptr, int size) throws IOException {
    final int resumeAt = pos + size;
    if (pos <= beg) {
        // Nothing pending before the current byte.
        return resumeAt;
    }
    append(ptrBytes, ptr + beg, pos - beg);
    return resumeAt;
}

// C: convert_UTF8_to_ASCII_only_JSON
void encodeASCII(ThreadContext context, ByteList src, OutputStream out) throws IOException {
byte[] escape_table = scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ASCII_ONLY_ESCAPE_TABLE;
void encodeASCII(ByteList src, byte[] escape_table) throws IOException {
byte[] hexdig = HEX;
byte[] scratch = aux;

Expand All @@ -198,13 +282,13 @@ void encodeASCII(ThreadContext context, ByteList src, OutputStream out) throws I
int pos = 0;

while (pos < len) {
byte ch = ptrBytes[ptr + pos];
int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]);
int ch_len = escape_table[ch];

if (ch_len != 0) {
switch (ch_len) {
case 9: {
if (pos > beg) { append(ptrBytes, ptr + beg, pos - beg); } pos += 1; beg = pos; // FLUSH_POS
beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
switch (ch) {
case '"': appendEscape(BACKSLASH_DOUBLEQUOTE); break;
case '\\': appendEscape(BACKSLASH_BACKSLASH); break;
Expand Down Expand Up @@ -245,7 +329,7 @@ void encodeASCII(ThreadContext context, ByteList src, OutputStream out) throws I
wchar = (wchar << 6) | (ptrBytes[ptr + pos +i] & 0x3F);
}

if (pos > beg) { append(ptrBytes, ptr + beg, pos - beg); } pos += ch_len; beg = pos; // FLUSH_POS
beg = pos = flushPos(pos, beg, ptrBytes, ptr, ch_len);

if (wchar <= 0xFFFF) {
scratch[2] = hexdig[wchar >> 12];
Expand Down

0 comments on commit 38c7831

Please sign in to comment.