Skip to content

Instantly share code, notes, and snippets.

@ggoodman
Created September 4, 2024 14:45
Show Gist options
  • Save ggoodman/c4e72b3bc11d35207ab21755b4af18f8 to your computer and use it in GitHub Desktop.
Save ggoodman/c4e72b3bc11d35207ab21755b4af18f8 to your computer and use it in GitHub Desktop.
Streaming JSON decoder POC
// @ts-check
const CURLY_OPEN = '{'.charCodeAt(0);
const CURLY_CLOSE = '}'.charCodeAt(0);
const SQUARE_OPEN = '['.charCodeAt(0);
const SQUARE_CLOSE = ']'.charCodeAt(0);
const DOUBLE_QUOTE = '"'.charCodeAt(0);
const CHARACTER_COLON = ':'.charCodeAt(0);
const CHARACTER_COMMA = ','.charCodeAt(0);
const CHARACTER_BACKSLASH = '\\'.charCodeAt(0);
const CHARACTER_LOWER_E = 'e'.charCodeAt(0);
const CHARACTER_LOWER_F = 'f'.charCodeAt(0);
const CHARACTER_LOWER_N = 'n'.charCodeAt(0);
const CHARACTER_LOWER_T = 't'.charCodeAt(0);
const CHARACTER_UPPER_E = 'E'.charCodeAt(0);
const CHARACTER_0 = '0'.charCodeAt(0);
const CHARACTER_9 = '9'.charCodeAt(0);
const CHARACTER_DECIMAL = '.'.charCodeAt(0);
const CHARACTER_PLUS = '+'.charCodeAt(0);
const CHARACTER_MINUS = '-'.charCodeAt(0);
const WHITESPACE_SPACE = 0x20;
const WHITESPACE_HORIZONTAL_TAB = 0x09;
const WHITESPACE_NL = 0x0a;
const WHITESPACE_CR = 0x0d;
const WRITE_REJECTED = tagLiteral(0, 'writeResult');
const WRITE_ACCEPTED = tagLiteral(1, 'writeResult');
const WRITE_COMPLETE = tagLiteral(2, 'writeResult');
/** @typedef {typeof WRITE_REJECTED | typeof WRITE_ACCEPTED | typeof WRITE_COMPLETE} WriteResult */
/**
* @template T
* @template {string} Tag
* @param {T} val
* @param {Tag} tagName
* @returns {T & { __tag: Tag }}
*/
function tagLiteral(val, tagName) {
return /** @type {T & { __tag: Tag }} */ (val);
}
const STATE_INITIAL = tagLiteral(0, 'state');
const STATE_CONSUMING = tagLiteral(1, 'state');
const STATE_COMPLETE = tagLiteral(2, 'state');
/** @typedef {typeof STATE_INITIAL | typeof STATE_CONSUMING | typeof STATE_COMPLETE} ParserState */
/**
* @implements {TypedDecoder}
*/
class ValueDecoder {
/**
* @type {TypedDecoder | null}
*/
#typedDecoder = null;
/** @type {ParserState} */
#state = STATE_INITIAL;
#bytesWritten = 0;
write(byte) {
this.#bytesWritten++;
switch (this.#state) {
case STATE_INITIAL: {
switch (byte) {
case CURLY_OPEN: {
this.#typedDecoder = new ObjectDecoder();
this.#state = STATE_CONSUMING;
return WRITE_ACCEPTED;
}
case SQUARE_OPEN: {
this.#typedDecoder = new ArrayDecoder();
this.#state = STATE_CONSUMING;
return WRITE_ACCEPTED;
}
case DOUBLE_QUOTE: {
this.#typedDecoder = new StringDecoder();
this.#state = STATE_CONSUMING;
return WRITE_ACCEPTED;
}
case CHARACTER_LOWER_F: {
this.#typedDecoder = new FalseDecoder();
this.#state = STATE_CONSUMING;
return WRITE_ACCEPTED;
}
case CHARACTER_LOWER_N: {
this.#typedDecoder = new NullDecoder();
this.#state = STATE_CONSUMING;
return WRITE_ACCEPTED;
}
case CHARACTER_LOWER_T: {
this.#typedDecoder = new TrueDecoder();
this.#state = STATE_CONSUMING;
return WRITE_ACCEPTED;
}
}
if (
byte === CHARACTER_MINUS ||
byte === CHARACTER_PLUS ||
(byte >= CHARACTER_0 && byte <= CHARACTER_9)
) {
this.#typedDecoder = new NumberDecoder();
this.#typedDecoder.write(byte);
this.#state = STATE_CONSUMING;
return WRITE_ACCEPTED;
}
if (isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
throw new Error(`Unexpected character: ${String.fromCharCode(byte)}`);
}
case STATE_CONSUMING: {
invariant(this.#typedDecoder, 'Missing a typed decoder');
const decoderResult = this.#typedDecoder.write(byte);
if (decoderResult === WRITE_COMPLETE) {
this.#state = STATE_COMPLETE;
} else if (decoderResult === WRITE_REJECTED) {
// The NumberDecoder rejected the write suggesting the potential end of the number.
// Let's mark decoding as completed and allow a parent decoder to do the back-tracking.
this.#state = STATE_COMPLETE;
}
return decoderResult;
}
case STATE_COMPLETE: {
if (!isWhitespace(byte)) {
throw new Error('Expected whitespace');
}
return WRITE_ACCEPTED;
}
}
throw new Error('Unexpected character');
}
/**
*
* @returns {unknown}
*/
finalize() {
invariant(this.#typedDecoder, 'Attempting to finalize without a typed decoder');
return this.#typedDecoder.finalize();
}
}
/**
* @implements {TypedDecoder<false>}
*/
class FalseDecoder {
static #expect = ['a'.charCodeAt(0), 'l'.charCodeAt(0), 's'.charCodeAt(0), 'e'.charCodeAt(0)];
#nextIndex = 0;
write(byte) {
if (this.#nextIndex >= FalseDecoder.#expect.length) {
if (isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
throw new Error(`Unexpected character, got ${byte}`);
}
if (this.#nextIndex === 0 && isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
const expectedByte = FalseDecoder.#expect[this.#nextIndex];
this.#nextIndex++;
if (byte !== expectedByte) {
throw new Error(`Expecting character ${JSON.stringify(String.fromCharCode(expectedByte))}`);
}
if (this.#nextIndex === FalseDecoder.#expect.length) {
return WRITE_COMPLETE;
}
return WRITE_ACCEPTED;
}
/**
*
* @returns {false}
*/
finalize() {
return false;
}
}
/**
* @implements {TypedDecoder<null>}
*/
class NullDecoder {
static #expect = ['u'.charCodeAt(0), 'l'.charCodeAt(0), 'l'.charCodeAt(0)];
#nextIndex = 0;
write(byte) {
if (this.#nextIndex >= NullDecoder.#expect.length) {
if (isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
throw new Error(`Unexpected character, got ${byte}`);
}
if (this.#nextIndex === 0 && isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
const expectedByte = NullDecoder.#expect[this.#nextIndex];
this.#nextIndex++;
if (byte !== expectedByte) {
throw new Error(`Expecting character ${JSON.stringify(String.fromCharCode(expectedByte))}`);
}
if (this.#nextIndex === NullDecoder.#expect.length) {
return WRITE_COMPLETE;
}
return WRITE_ACCEPTED;
}
/**
*
* @returns {null}
*/
finalize() {
return null;
}
}
/**
* @implements {TypedDecoder<true>}
*/
class TrueDecoder {
static #expect = ['r'.charCodeAt(0), 'u'.charCodeAt(0), 'e'.charCodeAt(0)];
#nextIndex = 0;
write(byte) {
if (this.#nextIndex >= TrueDecoder.#expect.length) {
if (isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
throw new Error(`Unexpected character, got ${byte}`);
}
if (this.#nextIndex === 0 && isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
const expectedByte = TrueDecoder.#expect[this.#nextIndex];
this.#nextIndex++;
if (byte !== expectedByte) {
throw new Error(`Expecting character ${JSON.stringify(String.fromCharCode(expectedByte))}`);
}
if (this.#nextIndex === TrueDecoder.#expect.length) {
return WRITE_COMPLETE;
}
return WRITE_ACCEPTED;
}
/**
*
* @returns {true}
*/
finalize() {
return true;
}
}
/**
* @implements {TypedDecoder<string>}
*/
class StringDecoder {
/**
* @readonly
* @type {Array<number>}
*/
#bytes = [];
#escaped = false;
/**
* @type {ParserState}
*/
#state = STATE_CONSUMING;
write(byte) {
invariant(
this.#state === STATE_CONSUMING,
'Must not write bytes to a StringDecoder after it has finalized'
);
// End of the string
if (!this.#escaped && byte === DOUBLE_QUOTE) {
this.#state = STATE_COMPLETE;
this.#escaped = false; // Probably unnecessary
return WRITE_COMPLETE;
}
if (!this.#escaped && byte === CHARACTER_BACKSLASH) {
this.#escaped = true;
} else {
this.#escaped = false;
}
this.#bytes.push(byte);
return WRITE_ACCEPTED;
}
finalize() {
invariant(
this.#state === STATE_COMPLETE,
"Attempting to finalize a string that isn't in the complete state"
);
const decoder = new TextDecoder('utf-8');
const buf = Uint8Array.from(this.#bytes);
const str = decoder.decode(buf);
const parsed = JSON.parse(`"${str}"`);
invariant(typeof parsed === 'string', 'Finalizing a string resulted in a non-string type');
return parsed;
}
}
const ARRAY_STATE_BEFORE_VALUE = tagLiteral(0, 'arrayState');
const ARRAY_STATE_VALUE = tagLiteral(1, 'arrayState');
const ARRAY_STATE_AFTER_VALUE = tagLiteral(2, 'arrayState');
/** @typedef {typeof ARRAY_STATE_BEFORE_VALUE | typeof ARRAY_STATE_VALUE | typeof ARRAY_STATE_AFTER_VALUE} ArrayParserState */
/**
* @implements {TypedDecoder<Array<unknown>>}
*/
class ArrayDecoder {
/**
* @readonly
* @type {Array<unknown>}
*/
#arr = [];
/**
* @type {ArrayParserState}
*/
#arrayState = ARRAY_STATE_BEFORE_VALUE;
/**
* @type {ParserState}
*/
#state = STATE_CONSUMING;
/**
* @type {ValueDecoder | null}
*/
#valueDecoder = new ValueDecoder();
write(byte) {
invariant(
this.#state === STATE_CONSUMING,
'Invalid to call write() when not in consuming state'
);
switch (this.#arrayState) {
case ARRAY_STATE_BEFORE_VALUE: {
if (isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
if (byte === SQUARE_CLOSE) {
this.#state = STATE_COMPLETE;
return WRITE_COMPLETE;
}
this.#arrayState = ARRAY_STATE_VALUE;
this.#valueDecoder = new ValueDecoder();
this.#valueDecoder.write(byte);
return WRITE_ACCEPTED;
}
case ARRAY_STATE_VALUE: {
invariant(this.#valueDecoder, 'Expected to have a value decoder while in value state');
const ret = this.#valueDecoder.write(byte);
if (ret === WRITE_COMPLETE) {
this.#arr.push(this.#valueDecoder.finalize());
this.#valueDecoder = null;
this.#arrayState = ARRAY_STATE_AFTER_VALUE;
return WRITE_ACCEPTED;
} else if (ret === WRITE_ACCEPTED) {
return WRITE_ACCEPTED;
}
this.#arr.push(this.#valueDecoder.finalize());
this.#valueDecoder = null;
this.#arrayState = ARRAY_STATE_AFTER_VALUE;
// Intentionally allow ourselves to fall through here because the
// ValueDecoder didn't accept the value.
}
case ARRAY_STATE_AFTER_VALUE: {
if (isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
switch (byte) {
case SQUARE_CLOSE: {
// End of the object
this.#state = STATE_COMPLETE;
return WRITE_COMPLETE;
}
case CHARACTER_COMMA: {
this.#arrayState = ARRAY_STATE_BEFORE_VALUE;
return WRITE_ACCEPTED;
}
}
}
}
throw new Error('Unexpected character');
}
finalize() {
invariant(
this.#state === STATE_COMPLETE,
"Attempting to finalize a array that isn't in the complete state"
);
return this.#arr;
}
}
const NUMBER_STATE_SIGN = tagLiteral(0, 'numberState');
const NUMBER_STATE_INT = tagLiteral(1, 'numberState');
const NUMBER_STATE_FRACTION = tagLiteral(2, 'numberState');
const NUMBER_STATE_EXPONENT = tagLiteral(3, 'numberState');
/** @typedef {typeof NUMBER_STATE_SIGN | typeof NUMBER_STATE_INT | typeof NUMBER_STATE_FRACTION | typeof NUMBER_STATE_EXPONENT} NumberParserState */
/**
* @implements {TypedDecoder<number>}
*/
class NumberDecoder {
/**
* @readonly
* @type {Array<number>}
*/
#bytes = [];
/**
* @type {ParserState}
*/
#state = STATE_CONSUMING;
/**
* @type {NumberParserState}
*/
#numberState = NUMBER_STATE_SIGN;
write(byte) {
invariant(
this.#state === STATE_CONSUMING,
'Invalid to call write() when not in consuming state'
);
switch (this.#numberState) {
case NUMBER_STATE_SIGN: {
// Sign or no sign, we're going to be in the next state after this character.
this.#numberState = NUMBER_STATE_INT;
if (byte === CHARACTER_MINUS || byte === CHARACTER_PLUS) {
this.#bytes.push(byte);
return WRITE_ACCEPTED;
}
// Intentionally fall-through since we didn't see a sign byte.
}
case NUMBER_STATE_INT: {
if (byte >= CHARACTER_0 && byte <= CHARACTER_9) {
this.#bytes.push(byte);
return WRITE_ACCEPTED;
}
if (byte === CHARACTER_DECIMAL) {
this.#bytes.push(byte);
this.#numberState = NUMBER_STATE_FRACTION;
return WRITE_ACCEPTED;
}
break;
}
case NUMBER_STATE_FRACTION: {
if (byte >= CHARACTER_0 && byte <= CHARACTER_9) {
this.#bytes.push(byte);
return WRITE_ACCEPTED;
}
if (byte === CHARACTER_LOWER_E || byte === CHARACTER_UPPER_E) {
this.#bytes.push(byte);
this.#numberState = NUMBER_STATE_EXPONENT;
return WRITE_ACCEPTED;
}
break;
}
case NUMBER_STATE_EXPONENT: {
if (byte >= CHARACTER_0 && byte <= CHARACTER_9) {
this.#bytes.push(byte);
return WRITE_ACCEPTED;
}
break;
}
}
this.#state = STATE_COMPLETE;
return WRITE_REJECTED;
}
finalize() {
// invariant(
// this.#state === STATE_COMPLETE,
// 'Invalid to call finalize() when not in complete state'
// );
// Since numbers have no terminal token, we allow them to be finalized any time.
const decoder = new TextDecoder('utf-8');
const buf = Uint8Array.from(this.#bytes);
const str = decoder.decode(buf);
const parsed = JSON.parse(str);
invariant(
typeof parsed === 'number' && Number.isFinite(parsed) && !Number.isNaN(parsed),
'Invalid number'
);
return parsed;
}
}
const OBJECT_STATE_BEFORE_KEY = tagLiteral(0, 'objectState');
const OBJECT_STATE_KEY = tagLiteral(1, 'objectState');
const OBJECT_STATE_AFTER_KEY = tagLiteral(2, 'objectState');
const OBJECT_STATE_VALUE = tagLiteral(3, 'objectState');
const OBJECT_STATE_AFTER_VALUE = tagLiteral(4, 'objectState');
/** @typedef {typeof OBJECT_STATE_BEFORE_KEY | typeof OBJECT_STATE_KEY | typeof OBJECT_STATE_AFTER_KEY | typeof OBJECT_STATE_VALUE | typeof OBJECT_STATE_AFTER_VALUE } ObjectParserState */
/**
* @implements {TypedDecoder<Record<string, unknown>>}
*/
class ObjectDecoder {
/**
* @readonly
* @type {Record<string, unknown>}
*/
#obj = {};
/**
* @type {ObjectParserState}
*/
#objectState = OBJECT_STATE_BEFORE_KEY;
/**
* @type {ParserState}
*/
#state = STATE_CONSUMING;
/**
* @type {StringDecoder | null}
*/
#keyDecoder = null;
/**
* @type {string | null}
*/
#key = null;
/**
* @type {ValueDecoder | null}
*/
#valueDecoder = new ValueDecoder();
write(byte) {
invariant(
this.#state === STATE_CONSUMING,
'Invalid to call write() when not in consuming state'
);
switch (this.#objectState) {
case OBJECT_STATE_BEFORE_KEY: {
if (isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
if (byte === CURLY_CLOSE) {
this.#state = STATE_COMPLETE;
return WRITE_COMPLETE;
}
if (byte !== DOUBLE_QUOTE) {
throw new Error('Expecting double quote');
}
this.#objectState = OBJECT_STATE_KEY;
this.#keyDecoder = new StringDecoder();
return WRITE_ACCEPTED;
}
case OBJECT_STATE_KEY: {
invariant(this.#keyDecoder, 'Expected to have a string decoder while in key parsing mode');
const keyState = this.#keyDecoder.write(byte);
if (keyState === WRITE_COMPLETE) {
this.#key = this.#keyDecoder.finalize();
this.#keyDecoder = null;
this.#objectState = OBJECT_STATE_AFTER_KEY;
}
return WRITE_ACCEPTED;
}
case OBJECT_STATE_AFTER_KEY: {
if (isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
if (byte !== CHARACTER_COLON) {
throw new Error("Expected ':'");
}
this.#objectState = OBJECT_STATE_VALUE;
this.#valueDecoder = new ValueDecoder();
return WRITE_ACCEPTED;
}
case OBJECT_STATE_VALUE: {
invariant(
this.#valueDecoder,
'Expected to have a value decoder while in value parsing mode'
);
const ret = this.#valueDecoder.write(byte);
if (ret === WRITE_COMPLETE) {
invariant(this.#key != null, 'Expected to have a stored key');
this.#obj[this.#key] = this.#valueDecoder.finalize();
this.#valueDecoder = null;
this.#objectState = OBJECT_STATE_AFTER_VALUE;
return WRITE_ACCEPTED;
} else if (ret === WRITE_ACCEPTED) {
return WRITE_ACCEPTED;
}
// The value decoder didn't accept the character. That we means we need
// to finalize it and fall through.
invariant(this.#key != null, 'Expected to have a stored key');
this.#obj[this.#key] = this.#valueDecoder.finalize();
this.#valueDecoder = null;
this.#objectState = OBJECT_STATE_AFTER_VALUE;
// Intentionally fall through
}
case OBJECT_STATE_AFTER_VALUE: {
if (isWhitespace(byte)) {
return WRITE_ACCEPTED;
}
switch (byte) {
case CURLY_CLOSE: {
// End of the object
this.#state = STATE_COMPLETE;
return WRITE_COMPLETE;
}
case CHARACTER_COMMA: {
this.#objectState = OBJECT_STATE_BEFORE_KEY;
return WRITE_ACCEPTED;
}
}
}
}
throw new Error('Unexpected character');
}
finalize() {
invariant(
this.#state === STATE_COMPLETE,
"Attempting to finalize a object that isn't in the complete state"
);
return this.#obj;
}
}
function isWhitespace(byte) {
switch (byte) {
case WHITESPACE_CR:
case WHITESPACE_HORIZONTAL_TAB:
case WHITESPACE_NL:
case WHITESPACE_SPACE:
return true;
}
return false;
}
/**
* Check an assertion for truthiness in a way that TypeScript's flow control will
* be able to reason about.
*
* @param {unknown} test
* @param {string} message
* @returns {asserts test}
*/
function invariant(test, message) {
if (!test) {
throw new Error(`Invariant violation: ${message}`);
}
}
/**
* @template [DecodedValue=unknown]
* @typedef TypedDecoder
* @property {(byte: number) => WriteResult} write
* @property {() => DecodedValue} finalize
*/
/* -------------------- TESTS -------------------- */
const assert = require('assert');
const { createReadStream, readFileSync } = require('fs');
/**
*
* @param {Uint8Array} buf
* @param {unknown} wantedValue
*/
function test(buf, wantedValue) {
const valueDecoder = new ValueDecoder();
let lastResult = 0;
for (let i = 0; i < buf.byteLength; i++) {
try {
lastResult = valueDecoder.write(buf[i]);
} catch (err) {
console.error('Parsing failed', {
offset: i,
prelude: new TextDecoder().decode(buf.slice(0, i)),
decoder: valueDecoder,
});
throw err;
}
}
let finalValue;
try {
finalValue = valueDecoder.finalize();
} catch (err) {
console.error('Finalize failed', {
decoder: valueDecoder,
});
throw err;
}
assert.deepEqual(finalValue, wantedValue);
}
const textEncoder = new TextEncoder();
const numbers = [1, -1, 1.0, -1.0, 1.0e10, -1.0e10];
for (const number of numbers) {
test(textEncoder.encode(JSON.stringify(number)), number);
}
test(textEncoder.encode(`${JSON.stringify(true)}`), true);
test(textEncoder.encode(`${JSON.stringify(false)}`), false);
test(textEncoder.encode(`${JSON.stringify(null)}`), null);
const strings = [
'hello world',
'Hello with some \\ escape characters and fake terminals ".',
'',
' \t\r\n ',
];
for (const string of strings) {
test(textEncoder.encode(JSON.stringify(string)), string);
}
const objects = [{ 'hello world': false }, { nested: { object: true } }];
for (const object of objects) {
test(textEncoder.encode(JSON.stringify(object)), object);
test(textEncoder.encode(JSON.stringify(object, null, 2)), object);
test(textEncoder.encode(JSON.stringify(object, null, '\t')), object);
}
const arrays = [[{ 'hello world': false }, { nested: { object: true } }], 12, 1e8, -9.000001];
for (const arr of arrays) {
test(textEncoder.encode(JSON.stringify(arr)), arr);
test(textEncoder.encode(JSON.stringify(arr, null, 2)), arr);
test(textEncoder.encode(JSON.stringify(arr, null, '\t')), arr);
}
const maxIters = 10000;
const start = Date.now();
const canadaJsonBytes = readFileSync('./Canada.json');
invariant(Buffer.isBuffer(canadaJsonBytes), 'must be a buffer');
const canadaJson = JSON.parse(canadaJsonBytes.toString());
console.log('Initial parse', Date.now() - start);
const iterStart = Date.now();
for (let iter = 0; iter < maxIters; iter++) {
const d = new ValueDecoder();
for (let i = 0; i < canadaJsonBytes.byteLength; i++) {
try {
d.write(canadaJsonBytes[i]);
} catch (err) {
console.error('Parsing failed', {
offset: i,
prelude: new TextDecoder().decode(canadaJsonBytes.slice(0, i)),
decoder: d,
});
throw err;
}
}
d.finalize();
console.log('Average cycle time after %d iters', iter, (Date.now() - iterStart) / (iter + 1));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment