Last active
February 27, 2024 04:59
-
-
Save timathom/c3f2899cab3de4c4f9dff3f14c2e8490 to your computer and use it in GitHub Desktop.
Jaro-Winkler similarity in XQuery 4.0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "4.0"; | |
(:~ | |
: | |
: Module Name: Jaro-Winkler String Similarity | |
: Date: February 26, 2024 | |
: License: GPLv3 | |
: XQuery specification: 4.0 | |
: Dependencies: BaseX 11.0 | |
: @author @timathom | |
: | |
: Revisions by @ChristianGruen and @joewiz | |
: | |
:) | |
declare namespace ec = "__entity-checker__"; | |
declare function ec:jaro-winkler( | |
$string1 as xs:string+, | |
$string2 as xs:string+ | |
) as xs:double { | |
if (normalize-space($string1) and normalize-space($string2)) then ( | |
let $sorted := sort(($string1, $string2), (), fn{string-length()}) | |
let $s1 := characters($sorted[1]) | |
let $s2 := characters($sorted[2]) | |
let $s1-len := count($s1) | |
let $s2-len := count($s2) | |
let $len := max(($s1-len, $s2-len)) | |
(: Maximum distance for matching characters. :) | |
let $distance := max(($s1-len, $s2-len)) idiv 2 - 1 | |
(: Find matching characters. :) | |
let $matches := ec:find-matches($s1, $s2, 1, $distance, $len, ()) | |
let $strings := $matches[. instance of xs:string] | |
let $m := count($strings) | |
return if ($m > 0) then ( | |
(: Get the indices of matched characters in each string. :) | |
let $indices := fn($s) { | |
for $match at $p in $strings | |
let $curr := index-of($s, $match) | |
return head( | |
$curr[. ge $p] | |
[(items-at($matches, .) instance of xs:string) or (items-at($matches, . - 1) instance of xs:string)] | |
) otherwise $curr | |
} | |
let $s1-indices := $indices($s1) | |
let $s2-indices := $indices($s2) | |
(: Check for characters that matched but are transposed. :) | |
let $t := sum( | |
for $_1 at $p1 in $s1[$s1-indices] | |
for $_2 at $p2 in $s2[$s2-indices] | |
where $p1 = $p2 and $_1 != $_2 | |
return 1 | |
) div 2 | |
(: Jaro formula. :) | |
let $j := ($m div $s1-len + $m div $s2-len + ($m - $t) div $m) div 3 | |
(: Winkler prefix. :) | |
let $sub-s1 := subsequence($s1, 1, 4) | |
let $sub-s2 := subsequence($s2, 1, 4) | |
let $l := while-do(1, fn($n) { $sub-s1[$n] = $sub-s2[$n] }, fn($n) { $n + 1 }) - 1 | |
return $j + 0.1e0 * $l * (1 - $j) | |
) else 0 | |
) else 0 | |
}; | |
declare function ec:find-matches( | |
$s1 as xs:string*, | |
$s2 as xs:string*, | |
$i as xs:integer, | |
$distance as xs:integer, | |
$len as xs:integer, | |
$matches as item()* | |
) as item()* { | |
if ($i > $len) then $matches else ( | |
(: Set the distance window. :) | |
let $start := max((1, $i - $distance)) | |
let $end := min(($len, $i + $distance)) | |
(: Get the first match. :) | |
let $match := head(subsequence($s2, $start, $end)[. = $s1[$i]]) | |
(: Recurse. :) | |
return ec:find-matches($s1, $s2, $i + 1, $distance, $len, ($matches, $match otherwise 0)) | |
) | |
}; | |
(: Sample strings. :) | |
let $strings := [ | |
["FAREMVIEL", "FARMVILLE", 0.9189814815], | |
["Elon Musk", "Colon Muskratty", 0.7657407407], | |
["xabcdxxxxxx", " ydyaybycyyyyyy", 0.3767676768], | |
["Exlon", "Colon", 0.7333333333], | |
["MARTHA", "MARHTA", 0.9611111111], | |
["bukhari muhammad ismat allah ibn mahmud", "bukhari muhammad ismat allah ibn mahmud active 16th century", 0.9322033898], | |
["andrews duncan 1935-2011", "andrews duncan", 0.9166666667], | |
["DWAYNE", "DUANE", 0.8400000000], | |
["FLWOR Found.", "FLWOR Foundation", 0.9208333333], | |
["My Gym Children's Fitness Center", "My Gym. Childrens Fitness", .9420000000], | |
["DIXON", "DICKSONX", 0.8133333333], | |
["JELLYFISH", "SMELLYFISH", 0.8962962963] | |
] | |
(: Test results and compare to Levenshtein (BaseX implementation). :) | |
for member $string in $strings | |
return | |
(: ec:jaro-winkler($string?1, $string?2) :) | |
string-join( | |
(`Comparing ``{$string?1}`` to ``{$string?2}``.`, | |
(: `The Levenshtein similarity is {string:levenshtein($string?1, $string?2)}.`, :) | |
`The Jaro-Winkler similarity is {ec:jaro-winkler($string?1, $string?2)}.`, | |
`Expected {$string?3}`||" ") | |
, " " | |
) |
Fixed prefix variables.
Added fix for trailing space.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Fixed bug causing truncated comparisons.