Last active
September 14, 2018 18:57
-
-
Save thomaswilburn/e3ee6189bd5b1f51a1fbc7636b64a3de to your computer and use it in GitHub Desktop.
CSV iterators
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Test data -- an identical "test.csv" on disk is used to check the streaming parser.
// The literal is trimmed so the fixture neither starts nor ends with a blank line.
const csv = `
one,two,three
1,2,3
a,b,c
"1,000",1000,"hey there"
`.trim();
/**
 * Easy numeric indexes for iterables: pairs each value with its zero-based position.
 *
 * @param {Iterable<*>} iter - any synchronous iterable (array, string, generator, ...).
 * @yields {Array} a two-element `[value, index]` tuple for every value, in iteration order.
 */
const forEach = function* (iter) {
  let index = 0;
  for (const value of iter) {
    yield [value, index];
    index += 1;
  }
};
/*
This is the oddest--and most interesting--function of the script, even at only 8 lines.
We want to be able to read from a string one character at a time. However, we also need
to be able to peek ahead at the incoming tokens for parsing, so we need a generator that
can be "backed up." This is not traditionally a thing you can do with iterators, but
it's simple enough here: in JS, when a generator resumes after yielding, the first
argument to `iterator.next()` becomes the value of the `yield` expression.
A consumer can therefore pass in an offset to adjust where we are in the string (`inc`).
Since the generator resumes after the yield, the order of operations is a little weird:

* initial run to the first yield
* on resume, store the offset that was passed in (`inc`)
* advance to the "next" character and read it
* adjust the position by `inc`
* yield the character that was read

The practical upshot: `chars.next(-1)` returns the upcoming character without consuming
it (the following `next()` returns that same character again), and yields `done` if there
is nothing left to peek at.

Using the iterator this way means that consuming loops look a little different from
traditional for...of iteration, since the generator has to be created and stored before
the loop so that its `.next()` method can be called independently. But if you think of
this as a stream of values, instead of a "loop" object, it's a lot easier to conceptualize.

@param {string} input - indexable sequence with a `length`; read one index at a time.
@yields {string} `input[i]` for each position visited.
*/
const readChars = function* (input) {
  let inc = 0;
  for (let i = 0; i < input.length; i++) {
    const c = input[i];
    // Apply the requested offset only AFTER reading, so a -1 rewinds the cursor for the
    // next resume while this resume still hands back the look-ahead character.
    i += inc;
    // A plain `next()` resumes with undefined, which means "no adjustment".
    inc = (yield c) || 0;
  }
};
/**
 * readLinesAsync is needed because streams give us multi-character chunks as input:
 * it re-slices an async sequence of chunks into individual lines.
 *
 * Lines are split on "\n"; a "\r" immediately before the "\n" (Windows/RFC 4180 CRLF
 * line endings) is dropped, even when the pair straddles two chunks. Whatever follows
 * the last newline is always yielded as the final line, so empty input yields a single
 * empty string and input ending in a newline yields a trailing empty line.
 *
 * @param {AsyncIterable<string>|Iterable<string>} input - chunks of text.
 *   NOTE(review): chunks are assumed to be strings (e.g. a stream opened with an
 *   encoding); anything else is string-concatenated as-is.
 * @yields {string} one line at a time, without its line terminator.
 */
const readLinesAsync = async function* (input) {
  // text after the most recent newline, carried over to the next chunk
  let buffer = "";
  for await (const chunk of input) {
    const pieces = (buffer + chunk).split("\n");
    // the last piece has no newline after it yet, so it is still incomplete
    buffer = pieces.pop();
    for (const piece of pieces) {
      yield piece.endsWith("\r") ? piece.slice(0, -1) : piece;
    }
  }
  yield buffer;
};
/**
 * Splits a synchronous sequence of characters into lines.
 *
 * Lines end at "\n"; a "\r" directly before the "\n" (CRLF line endings) is dropped.
 * The text after the last newline is always yielded as the final line, so empty input
 * yields a single empty string.
 *
 * @param {Iterable<string>} input - a string, or any iterable producing one character
 *   per step.
 * @yields {string} one line at a time, without its line terminator.
 */
const readLines = function* (input) {
  let buffer = "";
  for (const c of input) {
    if (c === "\n") {
      yield buffer.endsWith("\r") ? buffer.slice(0, -1) : buffer;
      buffer = "";
    } else {
      buffer += c;
    }
  }
  yield buffer;
};
/**
 * parseLine() does the actual parsing, and yields a stream of cell values from a row.
 *
 * Rules:
 * - cells are separated by commas; commas inside double quotes are literal;
 * - a double quote opens a quoted section, and inside one a doubled quote (`""`)
 *   stands for a single literal quote character (RFC 4180 escaping);
 * - a closing quote must be followed by a comma or the end of the line;
 * - every cell of a non-empty line is yielded, including a trailing empty one
 *   (`a,b,` produces three cells); an empty line produces no cells at all.
 *
 * @param {string} line - one row of CSV text, without its line terminator.
 * @yields {string} the raw (uncast) text of each cell, left to right.
 * @throws {string} "Cell continued after quote character" when text follows a closing
 *   quote, or "Unexpected end of line while still quoted" when a quote is never closed.
 *   NOTE: these are thrown as plain strings (not Error objects) on purpose, so existing
 *   callers that compare the thrown value keep working.
 */
const parseLine = function* (line) {
  let quoting = false;
  let buffer = "";
  for (let i = 0; i < line.length; i++) {
    const c = line[i];
    if (c === `"`) {
      if (!quoting) {
        quoting = true;
        continue;
      }
      // look one character ahead to decide what this quote means
      const peek = line[i + 1];
      if (peek === `"`) {
        // doubled quote inside a quoted section: emit one quote, skip its twin
        buffer += `"`;
        i++;
        continue;
      }
      if (peek && peek !== ",") throw "Cell continued after quote character";
      quoting = false;
      continue;
    }
    if (c === "," && !quoting) {
      yield buffer;
      buffer = "";
      continue;
    }
    buffer += c;
  }
  if (quoting) throw "Unexpected end of line while still quoted";
  // The text after the last comma is a cell even when it is empty; only a completely
  // empty line has no cells.
  if (line.length) yield buffer;
};
/**
 * Convert a raw cell value to a primitive type.
 *
 * - non-string values are returned untouched;
 * - the exact strings "true" / "false" become booleans;
 * - plain decimal numbers (optional leading "-", digits with at most one decimal
 *   point, e.g. "12", "-3.5", ".5", "5.") become numbers;
 * - everything else is returned as the original string. In particular, text that only
 *   looks numeric ("1.2.3", ".") is left alone instead of being truncated or
 *   turned into NaN.
 *
 * @param {*} v - the value to convert.
 * @returns {*} the converted primitive, or `v` itself when no rule applies.
 */
const cast = function (v) {
  if (typeof v !== "string") return v;
  if (v === "true") return true;
  if (v === "false") return false;
  // at least one digit is required, and only a single decimal point is allowed
  if (/^-?(?:\d+\.?\d*|\.\d+)$/.test(v)) return Number.parseFloat(v);
  return v;
};
/**
 * processLine assembles cells into a row, and is shared between sync and async code:
 * although I/O has to be handled asynchronously, we always have individual lines
 * all at once.
 *
 * @param {string} line - one row of CSV text.
 * @param {object} [options] - parsing options; omitting it is the same as `{}`.
 * @param {string[]} [options.header] - when truthy, used as the list of column names
 *   and the row is returned as an object keyed by `options.header[i]`.
 * @param {boolean} [options.autoParse] - when truthy, every cell is passed through
 *   `cast()`.
 * @returns {object|Array} a keyed object when `options.header` is set, otherwise an
 *   array of cells in column order.
 */
const processLine = function (line, options = {}) {
  // choose the per-cell conversion once instead of re-testing the flag for every cell
  const convert = options.autoParse ? cast : (cell) => cell;

  if (options.header) {
    const row = {};
    for (const [cell, i] of forEach(parseLine(line))) {
      row[options.header[i]] = convert(cell);
    }
    return row;
  }

  // non-keyed rows
  return [...parseLine(line)].map(convert);
};
/**
 * For streams, use this: parses CSV arriving as an async sequence of text chunks.
 *
 * @param {AsyncIterable<string>} input - chunked CSV text, e.g. a readable stream
 *   opened with an encoding.
 * @param {object} [options] - defaults to `{ autoParse: true }` when omitted entirely.
 * @param {true|string[]} [options.header] - `true` consumes the first line as the
 *   column names; any other truthy value is handed to `processLine` as-is.
 * @param {boolean} [options.autoParse] - forwarded to `processLine`.
 * @yields {object|Array} one parsed row per remaining line.
 */
const parseCSVAsync = async function* (input, options = { autoParse: true }) {
  // Work on a copy: the resolved header must not be written back into the caller's
  // object, otherwise reusing that object for another input would skip header detection.
  const settings = { ...options };
  const lines = readLinesAsync(input);
  if (settings.header === true) {
    const first = (await lines.next()).value;
    settings.header = [...parseLine(first)];
  }
  for await (const line of lines) {
    yield processLine(line, settings);
  }
};
/**
 * For strings, use this: parses CSV text that is already fully in memory.
 *
 * @param {Iterable<string>} input - the CSV text (iterated character by character).
 * @param {object} [options] - defaults to `{ autoParse: true }` when omitted entirely.
 * @param {true|string[]} [options.header] - `true` consumes the first line as the
 *   column names; any other truthy value is handed to `processLine` as-is.
 * @param {boolean} [options.autoParse] - forwarded to `processLine`.
 * @yields {object|Array} one parsed row per remaining line.
 */
const parseCSV = function* (input, options = { autoParse: true }) {
  // Work on a copy: the resolved header must not be written back into the caller's
  // object, otherwise reusing that object for another input would skip header detection.
  const settings = { ...options };
  const lines = readLines(input);
  if (settings.header === true) {
    const first = lines.next().value;
    settings.header = [...parseLine(first)];
  }
  for (const line of lines) {
    yield processLine(line, settings);
  }
};
// Demo 1: parse the in-memory fixture, using its first line as column names.
console.log("=== With header, not streaming ===");
for (const row of parseCSV(csv, { header: true, autoParse: true })) {
  console.log(row);
}

// Demo 2: parse the same data from disk as a stream, with default options
// (rows come back as arrays because no header option is given).
(async function () {
  const fs = require("fs");
  const stream = fs.createReadStream("test.csv", "utf-8");
  console.log("=== No header, streaming ===");
  for await (const row of parseCSVAsync(stream)) {
    console.log(row);
  }
})().catch((err) => {
  // e.g. test.csv is missing or contains a malformed row: report it and signal
  // failure instead of leaving an unhandled promise rejection
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment