Last active
March 20, 2018 23:02
-
-
Save nknapp/ca76edbaf2871c729409e593e616fa8b to your computer and use it in GitHub Desktop.
searching phashs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Precompute the number of matching bits by the xor-result of two bytes | |
const matchingBitsFromXor = [] | |
for (let i = 0; i < 256; i++) { | |
matchingBitsFromXor[i] = countMatchingBits(i) | |
} | |
class PhashBuffer { | |
constructor (bytesPerHash, maxSize) { | |
this.buffer = Buffer.alloc(bytesPerHash * maxSize) | |
this.next = 0 | |
this.bytesPerHash = bytesPerHash | |
this.maxSize = maxSize | |
} | |
add (phash) { | |
if (this.next >= this.maxSize) { | |
throw new Error(`Index out of bounds (max: ${this.maxSize}): ${this.next}`) | |
} | |
this.buffer.write(phash, this.next * this.bytesPerHash, this.bytesPerHash, 'hex') | |
this.next++ | |
} | |
get (index) { | |
return this.buffer.toString('hex', index * 9, (index + 1) * 9) | |
} | |
/** | |
* Hamming distance between a value (Buffer) and a value stored at an index in the this | |
* PhashBuffer | |
* @param {number} compareIndex | |
* @param {Buffer} value | |
* @return {number} | |
*/ | |
hammingDist (compareIndex, value) { | |
let matchingBits = 0 | |
for (let i = 0; i < this.bytesPerHash; i++) { | |
let byte1 = value[i] | |
let byte2 = this.buffer[compareIndex * this.bytesPerHash + i] | |
let xor = byte1 ^ byte2 | |
matchingBits += matchingBitsFromXor[xor] | |
// console.log(byte1.toString(2), byte2.toString(2), xor.toString(2), matchingBitsFromXor[xor], matchingBits) | |
} | |
return matchingBits | |
} | |
/** | |
* Search for similar values in the buffer | |
* @param {string} phash hex-encoded phash | |
* @param {number} threshold minimum number of matching bits for results | |
*/ | |
search (phash, threshold, maxResults) { | |
if (phash.length !== this.bytesPerHash * 2) { | |
throw new Error(`phash ${phash} does not have correct length (actual: ${phash.length}, expected ${this.bytesPerHash * 2}`) | |
} | |
const value = Buffer.from(phash, 'hex') | |
const results = [] | |
let count = 0 | |
for (let index = 0; index < this.maxSize; index++) { | |
let dist = this.hammingDist(index, value) | |
// Keep the 20 best items | |
if (dist >= threshold) { | |
count++ | |
// If we don't have enough results yet, if the current result is good enough, add it. Then sort | |
if (results.length < maxResults || results[maxResults - 1].dist < dist) { | |
results.push({index, dist}) | |
results.sort((x1, x2) => x2.dist - x1.dist) | |
} | |
} | |
} | |
// Sort reverse by dist | |
return { | |
count, results: results.slice(0, maxResults) | |
} | |
} | |
} | |
/** | |
* Count the matching bits two bytes based on there biwise-xor value | |
* @param xor | |
* @return {number} | |
*/ | |
function countMatchingBits (xor) { | |
// Assume all bytes match, xor is zero for all matching bits | |
let result = 8 | |
for (let n = 0; n < 8; n++) { | |
// Reduce count if n-th bit of xor is not zero | |
result -= (xor >> n) & 1 | |
} | |
return result | |
} | |
function performanceTest (nrOfHashes, threshold) { | |
console.error(`${nrOfHashes / 1000000} million hashes with threshold ${threshold}`) | |
const hashBuffer = new PhashBuffer(9, nrOfHashes) | |
const startTime = Date.now() | |
const nextValue = Buffer.alloc(9) | |
for (let i = 0; i < nrOfHashes; i++) { | |
nextValue.writeUInt32BE(i, 5) | |
hashBuffer.add(nextValue.toString('hex')) | |
} | |
console.error(`Buffer filled in ${Date.now() - startTime} ms`) | |
const searchStartTime = Date.now() | |
let phash = '000000000000000005' | |
console.error('Searching for ' + phash) | |
const {count, results} = hashBuffer.search(phash, threshold, 10) | |
let searchTime = Date.now() - searchStartTime | |
console.error(`Searched in ${searchTime} ms`) | |
console.error(`Got ${count} results`) | |
console.error(`Best results: `, results.map(({index, dist}) => ({index, dist, value: hashBuffer.get(index)}))) | |
console.error(`---------------------------------------`) | |
console.log(`TLDR: ${nrOfHashes / 1000000} million and thres ${threshold} => ${searchTime} ms (${count} found)`) | |
} | |
console.error = () => {} | |
performanceTest(10000, 60) | |
performanceTest(50000, 60) | |
performanceTest(2000000, 60) | |
performanceTest(4000000, 60) | |
performanceTest(4000000, 70) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
POC implementation of a buffer containing 72-bit phashes (perceptive hashes for image comparison).
I wanted to know if iterating a buffer like that is a feasible method for finding similar images in a database.
For me, the answer is yes.
Stats for 4 million hashes searched in 139 ms on my laptop