Skip to content

Instantly share code, notes, and snippets.

@SiZapPaaiGwat
Created March 28, 2024 08:35
Show Gist options
  • Save SiZapPaaiGwat/0ef3e8b745ba89877cf0dc1b9dbe4d86 to your computer and use it in GitHub Desktop.
Save SiZapPaaiGwat/0ef3e8b745ba89877cf0dc1b9dbe4d86 to your computer and use it in GitHub Desktop.
bigJsonParse - a JSON.parse alternative for large JSON files
import fs from "node:fs";
/**
 * Parses a JSON document while optionally truncating long arrays and
 * reporting where long arrays occur.
 *
 * Containers (objects and arrays) are split into their top-level members by a
 * small string/bracket-aware scanner; scalars (strings, numbers, booleans,
 * null) are decoded with the built-in `JSON.parse`, so escape sequences are
 * handled exactly as in standard JSON.
 *
 * @param {string} jsonString - The JSON text to parse.
 * @param {object} [options]
 * @param {number} [options.maxArrayItemThershold=Number.MAX_SAFE_INTEGER] -
 *   Maximum number of items kept per array; the remainder of a longer array
 *   is skipped without being parsed.
 * @param {number} [options.recordOversizedArrayThershold=50] - Arrays whose
 *   parsed length (after truncation) is greater than this value have their
 *   path recorded.
 * @returns {{ result: any, oversizedArraysPaths: string[] }} The parsed value
 *   and the paths (e.g. `data.items[0].tags`) of the recorded arrays.
 * @throws {TypeError} If `jsonString` is not a string.
 * @throws {SyntaxError} If the text is not well-formed JSON.
 */
function bigJsonParse(
  jsonString,
  {
    maxArrayItemThershold = Number.MAX_SAFE_INTEGER,
    recordOversizedArrayThershold = 50,
  } = {}
) {
  if (typeof jsonString !== "string") {
    throw new TypeError("bigJsonParse expects the JSON document as a string");
  }

  // Paths of arrays whose length exceeds recordOversizedArrayThershold.
  const oversizedArraysPaths = [];

  // Ensures a container segment ends with its closing bracket; without this
  // check the last character of a malformed segment would be silently dropped.
  function assertClosedBy(segment, closer) {
    if (!segment.endsWith(closer)) {
      throw new SyntaxError(
        `Expected "${closer}" at the end of: ${segment.slice(0, 40)}`
      );
    }
  }

  // Returns the index of the quote closing the string literal that starts at
  // text[0], honouring backslash escapes, or -1 if there is no such literal.
  function findStringEnd(text) {
    if (text[0] !== '"') {
      return -1;
    }
    for (let i = 1; i < text.length; i++) {
      if (text[i] === "\\") {
        i++; // skip the escaped character, whatever it is
      } else if (text[i] === '"') {
        return i;
      }
    }
    return -1;
  }

  // Lazily yields the comma-separated top-level members of a container
  // segment (the text between its outer brackets). Commas inside strings or
  // nested containers do not split. A blank trailing member is not yielded,
  // so `[]`, `[ ]` and a trailing comma produce no extra item.
  function* splitTopLevel(segment) {
    const end = segment.length - 1;
    let memberStart = 1;
    let depth = 0;
    let isInString = false;
    let isEscaped = false;

    for (let i = 1; i < end; i++) {
      const char = segment[i];

      if (isInString) {
        if (isEscaped) {
          isEscaped = false;
        } else if (char === "\\") {
          isEscaped = true;
        } else if (char === '"') {
          isInString = false;
        }
        continue;
      }

      if (char === '"') {
        isInString = true;
      } else if (char === "[" || char === "{") {
        depth++;
      } else if (char === "]" || char === "}") {
        depth--;
      } else if (char === "," && depth === 0) {
        yield segment.slice(memberStart, i);
        memberStart = i + 1;
      }
    }

    const lastMember = segment.slice(memberStart, end);
    if (lastMember.trim() !== "") {
      yield lastMember;
    }
  }

  // Parses one JSON value, dispatching on its first non-blank character.
  function parseValue(segment, currentPath = "") {
    const text = segment.trim();
    if (text.startsWith("{")) {
      return parseObject(text, currentPath);
    }
    if (text.startsWith("[")) {
      return parseArray(text, currentPath);
    }
    // Strings, numbers, booleans and null: JSON.parse decodes escape
    // sequences and rejects malformed scalars with a SyntaxError.
    return JSON.parse(text);
  }

  // Parses an array, keeping at most maxArrayItemThershold items and
  // recording its path when it is longer than recordOversizedArrayThershold.
  function parseArray(segment, currentPath) {
    assertClosedBy(segment, "]");
    const array = [];

    if (maxArrayItemThershold > 0) {
      for (const item of splitTopLevel(segment)) {
        // array.length is read before the push, so it is the new item's index.
        array.push(parseValue(item, `${currentPath}[${array.length}]`));
        if (array.length >= maxArrayItemThershold) {
          break; // stop early: the rest of the array is never scanned
        }
      }
    }

    if (array.length > recordOversizedArrayThershold) {
      oversizedArraysPaths.push(currentPath);
    }
    return array;
  }

  // Parses an object; each member must have the form `"key": value`.
  function parseObject(segment, currentPath) {
    assertClosedBy(segment, "}");
    const obj = {};

    for (const member of splitTopLevel(segment)) {
      const text = member.trim();
      const keyEnd = findStringEnd(text);
      const afterKey = keyEnd === -1 ? "" : text.slice(keyEnd + 1).trimStart();
      if (!afterKey.startsWith(":")) {
        throw new SyntaxError(
          `Expected '"key": value' but found: ${text.slice(0, 40)}`
        );
      }

      const key = JSON.parse(text.slice(0, keyEnd + 1));
      const newPath = currentPath ? `${currentPath}.${key}` : key;
      const value = parseValue(afterKey.slice(1), newPath);

      if (key === "__proto__") {
        // Plain assignment would replace the prototype; define an own
        // property instead, which is what JSON.parse does.
        Object.defineProperty(obj, key, {
          value,
          writable: true,
          enumerable: true,
          configurable: true,
        });
      } else {
        obj[key] = value;
      }
    }
    return obj;
  }

  const result = parseValue(jsonString);
  return { result, oversizedArraysPaths };
}
// Example usage: parse example.json, write the pretty-printed result to
// out.json, and print the paths of the arrays that were reported as oversized.
// An explicit (empty) options object is passed so the parser's default
// thresholds are used.
const { result, oversizedArraysPaths } = bigJsonParse(
  fs.readFileSync("example.json", "utf8"),
  {}
);
fs.writeFileSync("out.json", JSON.stringify(result, null, 2));
console.log(oversizedArraysPaths);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment