Created
August 4, 2024 13:10
-
-
Save sebilasse/b5d2b484ff343ba59ac3f349571f1130 to your computer and use it in GitHub Desktop.
abstracted wd to as as described to max
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// import wikidata from "https://esm.sh/[email protected]"; | |
import { WBK } from "https://esm.sh/[email protected]"; | |
import { createHash } from "https://deno.land/[email protected]/hash/mod.ts"; | |
import { HEADERS } from "@/lib/constants.ts"; | |
import { quantityToAS } from "./Unit.ts"; | |
import { Redaktor } from "./context/index.ts"; | |
// import { WikidataSPARQL } from "./context/indexCheck.ts"; | |
// Shared wikibase-sdk instance pointed at the public Wikidata site and its SPARQL endpoint.
const wikidata = WBK({
  instance: 'https://www.wikidata.org',
  sparqlEndpoint: 'https://query.wikidata.org/sparql',
});

// Re-exported for callers of this module; read directly off the imported `WBK` value.
// NOTE(review): whether `WBK` itself exposes `getSitelinkUrl` depends on the SDK version — confirm.
export const getSitelinkUrl = WBK.getSitelinkUrl;
/**
 * One term of a JSON-LD context, flattened by `getPropertiesFromContext`.
 *
 * `prefix` and `suffix` are optional so that existing object literals keep
 * compiling; `getPropertiesFromContext` always fills them (defaulting to '').
 */
interface WikiPropertyVar {
  /** Term name, i.e. the key in the JSON-LD context. */
  name: string;
  /** Value of the term's `@id`. */
  property: string;
  /** Value of the term's `@container`, copied as found. */
  container: string | string[];
  /** Value of the term's `@type`, copied as found. */
  type: string | string[];
  /** True when any `@type` entry contains "FunctionalProperty". */
  functional: boolean;
  /** Text placed before each value of this property. */
  prefix?: string;
  /** Text placed after each value of this property. */
  suffix?: string;
}
/**
 * Media-valued Wikidata properties, grouped by the result field ("container")
 * they are collected into. Each entry maps a property id to the `type` array
 * that `wikiDetails` puts on the generated media object: an ActivityStreams
 * type followed by one or more `wd:` item references.
 */
const wdPropertyToAS: Record<string, Record<string, string[]>> = {
  icon: {
    P2910: ['Image', 'wd:Q138754'],
    P154: ['Image', 'wd:Q1886349'],
    P8972: ['Image', 'wd:Q2130'],
    P367: ['Image', 'wd:Q645745'],
    P2238: ['Document', 'wd:Q80071'],
  },
  image: {
    P18: ['Image', 'wd:Q478798'],
    P41: ['Image', 'wd:Q14660'],
    P94: ['Image', 'wd:Q14659'],
    P242: ['Image', 'wd:Q6664848'],
    P158: ['Image', 'wd:Q162919'],
    P948: ['Image', 'wd:Q22920576'],
    P8592: ['Image', 'wd:Q56240104'],
    P3451: ['Image', 'wd:Q28333482'],
    P4291: ['Image', 'wd:Q41363'],
    P5252: ['Image', 'wd:Q54819662'],
    P3311: ['Image', 'wd:Q611203'],
    P5282: ['Image', 'wd:Q3931145'],
    P117: ['Image', 'wd:Q496353'],
    P14: ['Image', 'wd:Q5759965'],
    P15: ['Image', 'wd:Q2298569'],
    P181: ['Image', 'wd:Q4257161'],
    P207: ['Image', 'wd:Q810826'],
    P10093: ['Image', 'wd:Q109592922'],
    P11832: ['Image', 'wd:Q725252'],
    P1442: ['Image', 'wd:Q381885'],
    P1543: ['Image', 'wd:Q168346'],
    P1621: ['Image', 'wd:Q4006'],
    P1766: ['Image', 'wd:Q55498668'],
    P1801: ['Image', 'wd:Q721747'],
    P1846: ['Image', 'wd:Q97378230'],
    P1943: ['Image', 'wd:Q4006'],
    P1944: ['Image', 'wd:Q2940514'],
    P2713: ['Image', 'wd:Q12139782'],
    P2716: ['Image', 'wd:Q170593'],
    P3383: ['Image', 'wd:Q374821'],
    P4004: ['Image', 'wd:Q331357'],
    P4640: ['Image', 'wd:Q658252'],
    P4896: ['Image', 'wd:Q3859833'],
    P491: ['Image', 'wd:Q4130'],
    P5775: ['Image', 'wd:Q2998430'],
    P6655: ['Image', 'wd:Q2991980'],
    P6802: ['Image', 'wd:Q478798'],
    P692: ['Image', 'wd:Q7187'],
    P8512: ['Image', 'wd:Q97650002'],
    P8517: ['Image', 'wd:Q2075301'],
    P9721: ['Image', 'wd:Q1137365'],
    P9906: ['Image', 'wd:Q1640824'],
  },
  attachment: {
    P10: ['Video', 'wd:Q98069877'],
    P443: ['Audio', 'wd:Q184377'], // pronunciation audio
    P898: ['Audio', 'wd:Q21204'], // IPA transcription
    P989: ['Audio', 'wd:Q653542', 'wd:Q106581471'], // "spoken text audio"
    P990: ['Audio', 'wd:Q53702817'], // audio recording of the subject's spoken voice
    P51: ['Audio', 'wd:Q26987229'], // audio
  },
};
/** Template for the `Link` object generated from one URL-like Wikidata property. */
interface WikiUrlLinkSpec {
  type: string[];
  rel: string;
  name: string;
  /** When set, the claim value is appended to this prefix to form the href. */
  prefix?: string;
  /** Overrides the default 'text/html' that `wikiDetails` assigns. */
  mediaType?: string;
}

// TODO : might be a wikidata entry with official site property
/**
 * Wikidata properties whose values become entries of the result's `url` array,
 * keyed by property id.
 */
const wdUrlPropertyToAS: Record<string, WikiUrlLinkSpec> = {
  P856: {type: ['Link'], rel: 'me', name: 'Website'},
  P2699: {type: ['Link'], rel: 'about', name: 'About'},
  P5996: {type: ['Link'], rel: 'related', name: 'Related Site'},
  P4238: {type: ['Link'], rel: 'related webcam', name: 'Webcam'},
  P485: {type: ['Link'], rel: 'related archives', name: 'Archive'},
  P5305: {type: ['Link'], rel: 'related sparql', name: 'Sparql'},
  P8402: {type: ['Link'], rel: 'content-repository openDataPortal', name: 'Open Data Portal'},
  P1324: {type: ['Link'], rel: 'code-repository', name: 'Repository'},
  P935: {type: ['Link'], rel: 'related gallery', name: 'Gallery', prefix: 'https://commons.wikimedia.org/wiki/'},
  P373: {type: ['Link'], rel: 'related category', name: 'Category', prefix: 'https://commons.wikimedia.org/wiki/'},
};
/**
 * Normalises a property descriptor that points at a direct Wikidata property.
 *
 * When `propO.property` starts with the `wdt:` prefix or with the expanded
 * direct-property IRI, the prefix is stripped (leaving e.g. "P31") and the copy
 * is flagged `wdt: true`. Everything else, including a non-string `property`,
 * is copied unchanged and flagged `wdt: false`.
 *
 * @param propO descriptor with a `property` field
 * @returns a shallow copy of `propO` with `wdt` set and `property` possibly shortened
 */
function simpleWDmap(propO: any) {
  const directPrefixes = [
    'wdt:',
    'https://www.wikidata.org/prop/direct/',
    // The canonical RDF namespace behind `wdt:` uses plain http.
    'http://www.wikidata.org/prop/direct/',
  ];
  const property = propO.property;
  const matched = typeof property === 'string'
    ? directPrefixes.find((prefix) => property.startsWith(prefix))
    : undefined;
  if (matched === undefined) {
    return { ...propO, wdt: false };
  }
  return {
    ...propO,
    property: property.slice(matched.length),
    wdt: true,
  };
}
/** Filter predicate: true only for descriptors whose `wdt` flag is truthy. */
function simpleWDfilter(propO: any) {
  const { wdt } = propO;
  return Boolean(wdt);
}
/**
 * Reducer that indexes descriptors by their `property` value.
 * Mutates and returns the accumulator `r`; a later descriptor with the same
 * `property` replaces an earlier one.
 */
function simpleWDreduce(r: any, propO: any) {
  return Object.assign(r, { [propO.property]: propO });
}
/**
 * Fetches Wikidata entities and returns them simplified, keyed by entity id.
 *
 * One request is sent per URL produced by `wikidata.getManyEntities`; requests
 * are staggered by 400 ms per URL index plus up to 800 ms of random jitter.
 * A request that fails (network error, non-OK status, unparsable body or a body
 * without `entities`) contributes no entities instead of rejecting the call.
 *
 * Post-processing applied to every simplified entity:
 * - a claim whose first rich value is a quantity is replaced by that quantity:
 *   the bare amount for unit '1', otherwise the result of `quantityToAS`
 *   (or the amount alone when that result has no `unit`, or when the
 *   conversion throws);
 * - the first raw P625 value becomes `location`, with `precision` moved to
 *   `wd:precision` and an `accuracy` derived from it;
 * - P2044 fills `location.altitude` when no altitude is set yet.
 *
 * @param ids entity ids to fetch
 * @param languages language codes; codes without '-' are lower-cased
 * @param properties optional map keyed by property id; only each entry's `type`
 *   is read, to decide whether a quantity amount is rounded to an integer
 * @returns object mapping entity id to the post-processed simplified entity
 */
async function fetchManyEntities(
  ids: string[],
  languages = ['en', 'de', 'fr', 'es', 'pt'],
  properties?: any
) {
  languages = languages.map((l) => l.indexOf('-') < 0 ? l.toLowerCase() : l);
  const urls = wikidata.getManyEntities({
    ids,
    languages, // returns all languages if not specified
    redirections: false, // defaults to true
  });

  const pause = (ms: number) =>
    new Promise<void>((resolve) => setTimeout(resolve, ms));

  // Resolves to the `entities` record of one response, or to an empty record on any failure.
  const fetchEntities = async (url: string, i: number): Promise<any> => {
    await pause((400 * i) + (Math.random() * 800));
    try {
      const r = await fetch(url, HEADERS.GET.JSON);
      if (!r.ok) return {};
      const body: any = await r.json();
      return body?.entities || {};
    } catch (e) {
      console.log(e);
      return {};
    }
  };

  // True when the property's declared type names 'xsd:int' or contains 'Integer'.
  // Tolerates a missing `properties` map and a `type` given as a single string.
  const isIntegerProperty = (k: string): boolean => {
    const declared = properties?.[k]?.type;
    const types: unknown[] = Array.isArray(declared) ? declared : [declared];
    return types.some((t) =>
      typeof t === 'string' && (t === 'xsd:int' || t.indexOf('Integer') > -1)
    );
  };

  const resArray = await Promise.all(urls.map(fetchEntities));
  const res: any = {};
  for (const entities of resArray) {
    const simplified: any = wikidata.simplify.entities(entities);
    // Second pass keeps datatypes and rich values so quantities (amount + unit) can be detected.
    const simplifiedQuali: any = wikidata.simplify.entities(
      entities,
      { keepTypes: true, keepRichValues: true },
    );
    for (const key in simplified) {
      const entity = simplified[key];
      res[key] = entity;

      for (const k in entity.claims) {
        const q = simplifiedQuali[key]?.claims?.[k];
        const isQuantity = Array.isArray(q) && q.length && q[0] &&
          q[0].type === 'quantity' && q[0].value && typeof q[0].value === 'object';
        if (!isQuantity) continue;
        const { unit } = q[0].value;
        let { amount } = q[0].value;
        if (unit === '1') {
          entity.claims[k] = amount;
          continue;
        }
        // Round before converting so the rounding is reflected in the stored value.
        if (isIntegerProperty(k)) {
          amount = Math.round(amount);
        }
        try {
          const asQty = quantityToAS(amount, unit);
          entity.claims[k] = asQty.unit ? asQty : asQty.amount;
        } catch (e) {
          console.log(e);
          entity.claims[k] = amount;
        }
      }

      const rawCoordinates = entities[key]?.claims?.P625;
      if (Array.isArray(rawCoordinates) && rawCoordinates.length) {
        const coordinate = rawCoordinates[0].mainsnak?.datavalue?.value;
        if (coordinate) {
          entity.location = coordinate;
          if (entity.location.precision) {
            const accuracy = 100 - (Math.round(entity.location.precision * 111000 / 1000));
            // NOTE(review): a computed accuracy of 0 falls back to 90 and coarse
            // precisions yield negative values — confirm the intended scale.
            entity.location.accuracy = accuracy || 90;
            entity.location['wd:precision'] = entity.location.precision;
            delete entity.location.precision;
          }
        }
      }

      const elevation = entity.claims?.P2044;
      if (elevation) {
        if (!entity.location) { entity.location = {}; }
        if (!entity.location.altitude) {
          entity.location.altitude = Array.isArray(elevation) && elevation.length
            ? elevation[0]
            : elevation;
        }
      }
    }
  }
  return res;
}
/**
 * Builds the upload.wikimedia.org URL for a Wikimedia Commons file name.
 *
 * Whitespace is replaced by underscores; the first one and first two characters
 * of the MD5 hash of that name form the two directory levels of the path.
 * The file name is percent-encoded in the final URL so that characters such as
 * '#', '?' or '%' cannot break it; the hash is taken from the unencoded name.
 *
 * @param filename Commons file name
 * @returns absolute URL of the original file
 */
function getWikiImgSrc(filename: string) {
  const wmBase = 'https://upload.wikimedia.org/wikipedia/commons/';
  const wms = filename.replace(/\s/g, '_');
  const md5s = createHash('md5').update(wms).toString();
  const a = md5s.charAt(0);
  const b = md5s.charAt(1);
  return `${wmBase}${a}/${a}${b}/${encodeURIComponent(wms)}`;
}
/**
 * Wraps a Wikimedia Commons file name in an `Image` object with a single `Link`.
 *
 * The link's `href` comes from `getWikiImgSrc`. `mediaType` is derived from the
 * file extension when it is a known one ('jpg'/'jpeg' map to the registered
 * 'image/jpeg'). When the file name ends in `<w>x<h>.<ext>` with 2-3 digit
 * numbers, these are exposed as numeric `width` and `height`.
 *
 * @param src Commons file name; anything that is not a non-empty string yields `{}`
 * @param context optional context IRI(s), stored as an array on the link
 * @returns `{ type: 'Image', url: [link] }`, or `{}` for unusable input
 */
export function createWikiMedia(src: string, context?: string | string[]) {
  if (!src || typeof src !== 'string') {
    return {};
  }
  const mediaTypes = new Map<string, string>([
    ['svg', 'image/svg+xml'],
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['gif', 'image/gif'],
    ['webp', 'image/webp'],
    ['tif', 'image/tiff'],
    ['tiff', 'image/tiff'],
    ['oga', 'audio/ogg'],
    ['ogv', 'video/ogg'],
    ['ogg', 'application/ogg'],
    ['wav', 'audio/wav'],
    ['mp3', 'audio/mpeg'],
    ['flac', 'audio/flac'],
    ['webm', 'video/webm'],
  ]);

  const href = getWikiImgSrc(src);
  const link: any = { type: 'Link', href };
  if (context) {
    link.context = Array.isArray(context) ? context : [context];
  }

  // Extension = letters/digits after the last '.' that is followed by '?' or the end.
  const typeMatch = href.match(/[.]([a-z0-9]*)([?]|$)/i);
  const extension = typeMatch ? typeMatch[1].toLowerCase() : '';
  const mediaType = mediaTypes.get(extension);
  if (mediaType) {
    link.mediaType = mediaType;
  }

  // Size hint embedded in the file name, e.g. "…640x480.png".
  const sizeMatch = href.match(/(\d{2,3})x(\d{2,3})[.]([a-z0-9]*)([?]|$)/i);
  if (sizeMatch) {
    link.width = Number(sizeMatch[1]);
    link.height = Number(sizeMatch[2]);
  }

  return {
    type: 'Image',
    url: [link],
  };
}
/**
 * Flattens one or more JSON-LD contexts into a list of property descriptors.
 *
 * Accepts a single context, an array of contexts, or nested arrays. For each
 * object, its `@context` member is used when present, otherwise the object
 * itself. Every term whose definition is an object with a truthy `@id` yields
 * one descriptor, e.g.
 * `shortName: { '@id': 'wdt:P1813', '@type': ['owl:FunctionalProperty'] }`.
 * Null entries, non-object entries and terms defined by a plain string or null
 * are skipped.
 *
 * @param contexts context, or (nested) array of contexts
 * @param result accumulator that descriptors are appended to (mutated)
 * @returns the accumulator
 */
export function getPropertiesFromContext(
  contexts: any,
  result: WikiPropertyVar[] = [],
) {
  const list: any[] = Array.isArray(contexts) ? contexts : [contexts];
  return list.reduce((res: any, c: any) => {
    if (Array.isArray(c)) {
      return getPropertiesFromContext(c, res);
    }
    // `typeof null` is 'object', so null has to be excluded explicitly.
    if (!c || typeof c !== 'object') {
      return res;
    }
    const o = c['@context'] ? c['@context'] : c;
    for (const name in o) {
      const definition = o[name];
      if (!definition || typeof definition !== 'object') continue;
      const property = definition['@id'];
      if (!property) continue;
      const type = definition['@type'];
      const container = definition['@container'];
      const functional = (Array.isArray(type) ? type : [type]).some((s: unknown) =>
        typeof s === 'string' && s.indexOf('FunctionalProperty') > -1
      );
      res.push({
        name,
        property,
        type,
        container,
        functional,
        prefix: definition.prefix || '',
        suffix: definition.suffix || '',
      });
    }
    return res;
  }, result);
}
// TODO from cache
// TODO : also feed WikidataSPARQL, i.e. [WikidataSPARQL, Redaktor.$context]
// Descriptors of every term found in the Redaktor JSON-LD context.
const contextTerms = getPropertiesFromContext(Redaktor.$context);
// Lookup table of the direct Wikidata properties among them, keyed by the
// property id left after `simpleWDmap` strips the prefix.
const properties = contextTerms
  .map(simpleWDmap)
  .filter(simpleWDfilter)
  .reduce(simpleWDreduce, {});
// console.log(properties);
// Matches a bare Wikidata item id: "Q" followed by digits only.
const wikiIdRegex = /^Q\d+$/;
/**
 * Fetches Wikidata entities and converts each into an ActivityStreams-style
 * document.
 *
 * Per entity the result carries: labels/aliases/descriptions as language maps,
 * native and official names, `type` extended by the P31 values, a `url` list
 * (Wikidata, GeoNames, OpenStreetMap, sitelinks for the requested languages
 * and the properties listed in `wdUrlPropertyToAS`), hashtags from P1813 and
 * P2572, media from `wdPropertyToAS`, `country` from P17, every claim that has
 * a descriptor in `properties`, and a `bbox` from P1332-P1335.
 * Entities without claims are logged and skipped.
 *
 * @param ids Wikidata entity ids
 * @param type types every result starts with
 * @param languages language codes used for the fetch and for sitelink selection
 * @param filepath when set, each result is also written to `<filepath>/<id>.json`
 * @param countryCheckMap when set, entities whose P17 values do not contain the
 *   map's first key are reported on the console; also merged into the link map
 * @param idMap entity id → local identifier; mapped ids are linked as
 *   `/wiki/<identifier>` instead of `wd:<id>`
 * @returns the converted documents, in the iteration order of the fetch result
 */
export async function wikiDetails(
  ids: string[],
  type: string[] = [],
  languages = ['en', 'de', 'fr', 'es', 'pt', 'ar'],
  filepath: false | string = false,
  countryCheckMap?: {[wdId: string]: string /* ISO */},
  idMap?: {[wdId: string]: string /* ISO */},
) {
  const linkMap: {[wdId: string]: string} = {...countryCheckMap, ...idMap};
  languages = languages.map((l) => l.indexOf('-') < 0 ? l.toLowerCase() : l);
  const wdRes = await fetchManyEntities(ids, languages, properties);
  const results: any[] = [];

  const hasOwn = (o: any, k: string): boolean =>
    Object.prototype.hasOwnProperty.call(o, k);
  const listOf = (v: any): any[] => Array.isArray(v) ? v : [v];
  const isPlainObject = (v: any): boolean =>
    !!v && typeof v === 'object' && !Array.isArray(v);
  // Splits a sitelink key such as "enwikiquote" into language part and project suffix.
  const sitelinkKey =
    /^(.+?)(?:wiki|wikibooks|wikinews|wikiquote|wikisource|wikiversity|wikivoyage|wiktionary)$/;

  // Item ids become links (local `/wiki/…` when mapped, `wd:…` otherwise); structured
  // values are passed through untouched; everything else is stringified.
  const toLDlink = (t: any) => {
    if (isPlainObject(t)) return t;
    return t && typeof t === 'string' && wikiIdRegex.test(t)
      ? (linkMap[t] ? `/wiki/${linkMap[t]}` : `wd:${t}`)
      : `${t}`;
  };

  // console.log(wdRes);
  for (const wdId in wdRes) {
    const {id, location = {}, ...wd} = wdRes[wdId];
    const res: any = {
      type,
      id,
      name: [],
      updated: wd.modified || new Date().toISOString(),
      describes: [ `wd:${wdId}` ],
      nameMap: wd.labels ? wd.labels : {},
      alternativeNameMap: wd.aliases ? wd.aliases : {},
      summaryMap: wd.descriptions ? wd.descriptions : {},
      location
    };
    if (!wd?.claims) {
      console.log('not found', wdId);
      continue;
    }
    const claims = wd.claims;

    if (
      countryCheckMap && claims.P17 && Array.isArray(claims.P17) &&
      claims.P17.indexOf(Object.keys(countryCheckMap)[0]) < 0
    ) {
      console.log('Country error for:');
      console.log('"'+id+'":["'+wdId+'", '+JSON.stringify(claims.P17)+'],');
    }

    // TODO P740 eventLocation to location and // P706 located in/on physical feature
    // TODO P580 startTime and P582 endTime and P585 point in time / duration schema
    // 'schema:startDate', 'schema:endDate', 'schema:previousStartDate',
    if (claims.P1705) { // native name
      res.name = res.name.concat(listOf(claims.P1705));
    }
    if (claims.P1448) { // official name
      res.name = res.name.concat(listOf(claims.P1448));
    }
    if (claims.P31) {
      res.type = res.type.concat(listOf(claims.P31).map(toLDlink));
    }

    res.url = [
      {
        "type": "Link",
        "rel": "about",
        "href": `https://www.wikidata.org/wiki/Special:EntityData/${wdId}`,
        "nameMap": {
          "en": "details from wikidata",
          "fr": "détails de wikidata",
          "de": "Details von wikidata",
          "es": "detalles de wikidata",
          "pt": "pormenores da wikidata"
        },
        "mediaType": "application/ld+json"
      },
      {
        "type": "Link",
        "rel": "about",
        "href": `https://www.wikidata.org/entity/${wdId}`,
        "name": "wikidata",
        "mediaType": "text/html"
      }
    ];
    // TODO static GN IDs from mapping
    if (claims.P1566) {
      listOf(claims.P1566).forEach((gnId: string) => {
        res.url.push({
          "type": "Link",
          "rel": "about",
          "href": `https://www.geonames.org/${gnId}`,
          "name": "geonames",
          "mediaType": "text/html"
        });
      });
    }
    // TODO static OSM IDs from mapping
    for (const [osmProp, osmKind] of [['P402','relation'], ['P11693','node'], ['P10689','way']]) {
      if (!claims[osmProp]) continue;
      listOf(claims[osmProp]).forEach((osmId: string) => {
        res.url.push({
          "type": "Link",
          "rel": "about",
          "href": `https://openstreetmap.org/${osmKind}/${osmId}`,
          "name": "OpenStreetMap",
          "mediaType": "text/html"
        });
      });
    }

    if (wd.sitelinks) {
      // console.log(wd.sitelinks);
      languages.forEach((lang) => {
        // Site keys use '_' where language codes use '-'.
        const wanted = lang.toLowerCase().replace(/-/g, '_');
        for (const key in wd.sitelinks) {
          const parts = sitelinkKey.exec(key);
          // Compare the whole language part so that e.g. 'ar' does not pick up 'arzwiki'.
          if (!parts || parts[1] !== wanted) continue;
          try {
            const title = wd.sitelinks[key];
            const href = wikidata.getSitelinkUrl(key, title);
            res.url.push({
              type: 'Link',
              name: title,
              hreflang: lang,
              href,
              mediaType: 'text/html',
            });
          } catch (e) {
            // Best effort: a sitelink the SDK cannot build a URL for is skipped.
            // console.log(e);
          }
        }
      });
    }

    // short name (P1813) and hashtag (P2572) both become Hashtag tags
    for (const tagProp of ['P1813', 'P2572']) {
      if (!claims[tagProp]) continue;
      claims[tagProp] = listOf(claims[tagProp]);
      res.tag = (res.tag || []).concat(
        claims[tagProp].map((name: string) => ({
          type: 'Hashtag',
          name,
        })),
      );
    }

    for (const key in claims) {
      const values: any[] = claims[key] = listOf(claims[key]);
      let isInContainer = false;

      // media properties → icon / image / attachment
      for (const container in wdPropertyToAS) {
        if (!hasOwn(wdPropertyToAS[container], key)) continue;
        values.forEach((value: any) => {
          if (!res[container]) { res[container] = [] }
          res[container].push({
            ...createWikiMedia(
              `${value}`,
              `https://www.wikidata.org/prop/direct/${key}`,
            ),
            type: wdPropertyToAS[container][key],
          });
        });
        isInContainer = true;
      }

      // URL-like properties → url
      if (hasOwn(wdUrlPropertyToAS, key)) {
        const {type: linkType, rel, name: linkName, prefix: hrefPrefix, mediaType} =
          wdUrlPropertyToAS[key];
        if (!res.url) { res.url = [] }
        values.forEach((value: any) => {
          let href = value;
          if (hrefPrefix) {
            href = `${hrefPrefix}${value}`;
          } else if (linkMap[value]) {
            // Mapped ids win over the generic `wd:` link.
            href = `/wiki/${linkMap[value]}`;
          } else if (wikiIdRegex.test(value)) {
            href = `wd:${value}`;
          }
          res.url.push({
            type: linkType,
            name: linkName,
            rel,
            href,
            mediaType: mediaType || 'text/html'
          });
        });
        isInContainer = true;
      }
      if (isInContainer) { continue }

      if (
        // tags
        key === 'P1813' || key === 'P2572' ||
        // coordinates (to bbox)
        key === 'P1332' || key === 'P1333' ||
        key === 'P1334' || key === 'P1335' ||
        !properties[key]
      ) {
        continue;
      }

      const {
        name,
        functional,
        container = [],
        type: valueType = [],
        prefix: p = '',
        suffix = '',
      } = properties[key];
      const isSet = container && container.indexOf('@set') > -1;
      const isId = container && container.indexOf('@id') > -1;
      const prefix = (!p && isId)
        ? 'https://www.wikidata.org/wiki/'
        : p;

      values.forEach((value: any) => {
        try {
          const links = listOf(value).map(toLDlink);
          if (functional) {
            // functional: a single value; the last claim wins
            res[name] = links[0];
          } else {
            // grouped
            res[name] = (res[name] || []).concat(links);
          }
        } catch(e) {
          console.log(key, name);
        }
      });
      if (!functional && isSet && Array.isArray(res[name])) {
        // '@set' containers hold each value once.
        res[name] = Array.from(new Set(res[name]));
      }

      if (res[name]) {
        // Applies `fn` to every element of an array value, or to a scalar value
        // (only to string scalars when `stringScalarOnly`); structured values pass through.
        const convert = (fn: (s: any) => any, stringScalarOnly: boolean) => {
          const guarded = (s: any) => isPlainObject(s) ? s : fn(s);
          if (Array.isArray(res[name])) {
            res[name] = res[name].map(guarded);
          } else if (!stringScalarOnly || typeof res[name] === 'string') {
            res[name] = guarded(res[name]);
          }
        };
        if (valueType.indexOf('xsd:decimal') > -1) {
          convert((s) => !s ? 0 : parseFloat(s), true);
        } else if (valueType.indexOf('xsd:nonNegativeInteger') > -1) {
          convert((s) => !s ? 0 : parseInt(s, 10), true);
        } else if (valueType.indexOf('xsd:positiveInteger') > -1) {
          convert((s) => parseInt(s, 10)||null, true);
        } else if (prefix || suffix) {
          convert((s) => !s ? '' : `${prefix}${s}${suffix}`, false);
        }
      }
    }

    // Set once after the claim loop so per-property handling cannot duplicate it.
    if (claims.P17) {
      res.country = listOf(claims.P17).map(toLDlink);
    }

    if (
      !res.bbox && claims.P1332 && claims.P1333 &&
      claims.P1334 && claims.P1335
    ) {
      // N, S, E, W
      const { P1332, P1333, P1334, P1335 } = claims;
      if (P1332.length && P1333.length && P1334.length && P1335.length) {
        // Simplified coordinates are [latitude, longitude]: north/south read the
        // latitude, east/west the longitude.
        res.bbox = {
          north: typeof P1332[0] === 'number' ? P1332[0] : P1332[0][0],
          south: typeof P1333[0] === 'number' ? P1333[0] : P1333[0][0],
          east: typeof P1334[0] === 'number' ? P1334[0] : P1334[0][1],
          west: typeof P1335[0] === 'number' ? P1335[0] : P1335[0][1]
        };
      }
    }

    if (filepath) {
      Deno.writeTextFileSync(`${filepath}/${wdId}.json`, JSON.stringify(res, null, 2));
    }
    results.push(res);
  }
  return results;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment