fast loading of a large dataset into leveldb
// data comes from here http://stat-computing.org/dataexpo/2009/the-data.html
// download 1994.csv.bz2 and unpack by running: cat 1994.csv.bz2 | bzip2 -d > 1994.csv
// 1994.csv should be ~5.2 million lines and 500MB
// importing all rows into leveldb took ~50 seconds on my machine
// there are two main techniques at work here:
// 1: never create JS objects, leave the data as binary the entire time (binary-split does this)
// 2: group lines into 16 MB batches, to take advantage of leveldb's batch API (byte-stream does this)
var level = require('level')
var byteStream = require('byte-stream')
var split = require('binary-split')
var fs = require('fs')

var count = 0
var wbs = 1024 * 1024 * 16 // 16 MB write buffer, reused as the batch size
var db = level('data.db', {writeBufferSize: wbs}, function () {
  var batcher = byteStream(wbs)
  fs.createReadStream('1994.csv')
    .pipe(split())   // split the byte stream into lines without parsing them
    .pipe(batcher)   // accumulate lines until ~16 MB are buffered
    .on('data', function (lines) {
      var batch = db.batch()
      for (var i = 0; i < lines.length; i++) {
        batch.put(count, lines[i])
        count++
      }
      // once the batch is written, let the batcher emit the next group (backpressure)
      batch.write(batcher.next.bind(batcher))
    })
})
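To sanity-check the import once it finishes, something like this should work: a minimal sketch that streams a few records back out with level's createReadStream, assuming the default utf8 encodings and the data.db created above.

// sketch: read the first few records back to verify the import
var level = require('level')
var db = level('data.db')

db.createReadStream({ limit: 3 })
  .on('data', function (entry) {
    console.log(entry.key, '->', entry.value.slice(0, 72))
  })
  .on('end', function () {
    db.close()
  })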
Silly optimization, but I bet you can squeeze more perf out of this by changing the for loop to:

for (var i = 0, l = lines.length; i < l; i++) {

@joeybaker: V8 already does that optimisation for you.
nice one. thanks for sharing. didn't know about byte-stream or binary-split.
@aheckmann I wrote them this week :D
Well done Max!
OS: Darwin 10.9
Memory: 4 GB 1600 MHz DDR3
Processor: 1.8 GHz Intel Core i5
time node gist.js
66.84 real 91.93 user 4.09 sys
I just did a bigger import, all of the 1990s data.
cat 1990.csv.bz2 1991.csv.bz2 1992.csv.bz2 1993.csv.bz2 1994.csv.bz2 1995.csv.bz2 1996.csv.bz2 1997.csv.bz2 1998.csv.bz2 1999.csv.bz2 > 1990s.csv.bz2
cat 1990s.csv.bz2 | bzip2 -d > 1990s.csv
It results in a 52,694,400-line file (5.18GB CSV) and takes 11m4.321s to run the above script, which produces a 2.33GB leveldb folder.
have you tested how this behaves in relation to key size? i'm going to test tomorrow but i was just wondering.
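One thing worth noting when experimenting with keys: the script stores the counter with level's default utf8 key encoding, so keys sort lexicographically ('10' sorts before '2'). A minimal sketch of zero-padding the counter so keys stay in numeric order (the pad width of 8 is an arbitrary assumption):

// sketch: zero-pad the numeric counter so utf8-encoded keys sort numerically
function padKey (n) {
  var s = String(n)
  while (s.length < 8) s = '0' + s // assumes fewer than 100 million rows
  return s
}

// in the import loop above, instead of batch.put(count, lines[i]):
batch.put(padKey(count), lines[i])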