Skip to content

Instantly share code, notes, and snippets.

@sebilasse
Created August 4, 2024 13:10
Show Gist options
  • Save sebilasse/b5d2b484ff343ba59ac3f349571f1130 to your computer and use it in GitHub Desktop.
Save sebilasse/b5d2b484ff343ba59ac3f349571f1130 to your computer and use it in GitHub Desktop.
abstracted wd to as as described to max
// import wikidata from "https://esm.sh/[email protected]";
import { WBK } from "https://esm.sh/[email protected]";
import { createHash } from "https://deno.land/[email protected]/hash/mod.ts";
import { HEADERS } from "@/lib/constants.ts";
import { quantityToAS } from "./Unit.ts";
import { Redaktor } from "./context/index.ts";
// import { WikidataSPARQL } from "./context/indexCheck.ts";
// Wikibase SDK instance configured for the public Wikidata site and its
// SPARQL query service; used below to build entity URLs, simplify API
// responses and resolve sitelink URLs.
const wikidata = WBK({
instance: 'https://www.wikidata.org',
sparqlEndpoint: 'https://query.wikidata.org/sparql'
});
// Re-exported so that other modules can build sitelink URLs.
// NOTE(review): this destructures from the `WBK` factory function, not from the
// `wikidata` instance used elsewhere in this file (`wikidata.getSitelinkUrl`).
// Confirm the imported wikibase-sdk version exposes the helper on the factory;
// otherwise this export is `undefined`.
export const {getSitelinkUrl} = WBK;
/**
 * One term definition extracted from a JSON-LD context by
 * `getPropertiesFromContext`.
 */
interface WikiPropertyVar {
  /** Term name as it appears in the context (becomes the output key). */
  name: string;
  /** Value of the term's `@id`, e.g. `wdt:P1813`. */
  property: string;
  /** Value of the term's `@container` (may be absent at runtime). */
  container: string | string[];
  /** Value of the term's `@type` (may be absent at runtime). */
  type: string | string[];
  /** True when `@type` mentions `FunctionalProperty`. */
  functional: boolean;
  /** Optional string prepended to every value of this property. */
  prefix?: string;
  /** Optional string appended to every value of this property. */
  suffix?: string;
}
// Media-valued Wikidata properties, grouped by the result field ("container")
// they are collected into by wikiDetails: `icon`, `image` or `attachment`.
// Each entry maps a property id to the array that wikiDetails assigns as the
// `type` of the generated media object: the first element is the media kind
// ('Image', 'Document', 'Video', 'Audio'), the remaining elements are
// `wd:`-prefixed item ids.
const wdPropertyToAS: any = {
icon: {
P2910: ['Image', 'wd:Q138754'],
P154: ['Image', 'wd:Q1886349'],
P8972: ['Image', 'wd:Q2130'],
P367: ['Image', 'wd:Q645745'],
P2238: ['Document', 'wd:Q80071']
},
image: {
P18: ['Image', 'wd:Q478798'],
P41: ['Image', 'wd:Q14660'],
P94: ['Image', 'wd:Q14659'],
P242: ['Image', 'wd:Q6664848'],
P158: ['Image', 'wd:Q162919'],
P948: ['Image', 'wd:Q22920576'],
P8592: ['Image', 'wd:Q56240104'],
P3451: ['Image', 'wd:Q28333482'],
P4291: ['Image', 'wd:Q41363'],
P5252: ['Image', 'wd:Q54819662'],
P3311: ['Image', 'wd:Q611203'],
P5282: ['Image', 'wd:Q3931145'],
P117: ['Image', 'wd:Q496353'],
P14: ['Image', 'wd:Q5759965'],
P15: ['Image', 'wd:Q2298569'],
P181: ['Image', 'wd:Q4257161'],
P207: ['Image', 'wd:Q810826'],
P10093: ['Image', 'wd:Q109592922'],
P11832: ['Image', 'wd:Q725252'],
P1442: ['Image', 'wd:Q381885'],
P1543: ['Image', 'wd:Q168346'],
P1621: ['Image', 'wd:Q4006'],
P1766: ['Image', 'wd:Q55498668'],
P1801: ['Image', 'wd:Q721747'],
P1846: ['Image', 'wd:Q97378230'],
P1943: ['Image', 'wd:Q4006'],
P1944: ['Image', 'wd:Q2940514'],
P2713: ['Image', 'wd:Q12139782'],
P2716: ['Image', 'wd:Q170593'],
P3383: ['Image', 'wd:Q374821'],
P4004: ['Image', 'wd:Q331357'],
P4640: ['Image', 'wd:Q658252'],
P4896: ['Image', 'wd:Q3859833'],
P491: ['Image', 'wd:Q4130'],
P5775: ['Image', 'wd:Q2998430'],
P6655: ['Image', 'wd:Q2991980'],
P6802: ['Image', 'wd:Q478798'],
P692: ['Image', 'wd:Q7187'],
P8512: ['Image', 'wd:Q97650002'],
P8517: ['Image', 'wd:Q2075301'],
P9721: ['Image', 'wd:Q1137365'],
P9906: ['Image', 'wd:Q1640824']
},
attachment: {
P10: ['Video', 'wd:Q98069877'],
P443: ['Audio', 'wd:Q184377'], // pronunciation audio
P898: ['Audio', 'wd:Q21204'], // IPA transcription
P989: ['Audio', 'wd:Q653542', 'wd:Q106581471'], // "spoken text audio"
P990: ['Audio', 'wd:Q53702817'], // audio recording of the subject's spoken voice
P51: ['Audio', 'wd:Q26987229'] // audio
}
};
// TODO : might be a wikidata entry with official site property
// Link-valued Wikidata properties. wikiDetails turns every value of a listed
// property into an entry of the result's `url` array, copying `type`, `rel`
// and `name` from this table. When `prefix` is present it is prepended to the
// raw claim value to form the `href`; an optional `mediaType` key would
// override the default 'text/html' (no entry currently sets one).
const wdUrlPropertyToAS: any = {
P856: {type: ['Link'], rel: 'me', name: 'Website'},
P2699: {type: ['Link'], rel: 'about', name: 'About'},
P5996: {type: ['Link'], rel: 'related', name: 'Related Site'},
P4238: {type: ['Link'], rel: 'related webcam', name: 'Webcam'},
P485: {type: ['Link'], rel: 'related archives', name: 'Archive'},
P5305: {type: ['Link'], rel: 'related sparql', name: 'Sparql'},
P8402: {type: ['Link'], rel: 'content-repository openDataPortal', name: 'Open Data Portal'},
P1324: {type: ['Link'], rel: 'code-repository', name: 'Repository'},
P935: {type: ['Link'], rel: 'related gallery', name: 'Gallery', prefix: 'https://commons.wikimedia.org/wiki/'},
P373: {type: ['Link'], rel: 'related category', name: 'Category', prefix: 'https://commons.wikimedia.org/wiki/'}
};
/**
 * Returns a copy of a context property descriptor flagged with `wdt`.
 *
 * When `property` starts with the `wdt:` prefix or with the full
 * `https://www.wikidata.org/prop/direct/` IRI, that prefix is removed and
 * `wdt` is set to true; otherwise the descriptor is copied unchanged with
 * `wdt: false`.
 */
function simpleWDmap(propO: any) {
  // Checked in this order: compact prefix first, then the expanded IRI.
  const directPrefixes = ['wdt:', 'https://www.wikidata.org/prop/direct/'];
  for (const directPrefix of directPrefixes) {
    if (propO.property.indexOf(directPrefix) !== 0) {
      continue;
    }
    const bareId = propO.property.replace(directPrefix, '');
    return { ...propO, property: bareId, wdt: true };
  }
  return { ...propO, wdt: false };
}
/** Filter predicate: keeps only descriptors whose `wdt` flag is truthy. */
function simpleWDfilter(propO: any) {
  return Boolean(propO.wdt);
}
/**
 * Reducer that indexes descriptors by their `property` value: stores `propO`
 * on the accumulator under `propO.property` and returns the same accumulator.
 */
function simpleWDreduce(r: any, propO: any) {
  return Object.assign(r, { [propO.property]: propO });
}
/**
 * Fetches the given Wikidata entities and returns them in simplified form,
 * keyed by entity id.
 *
 * The ids are split into request URLs by `wikidata.getManyEntities`; each
 * request is started after a staggered, jittered delay. A failed request
 * (network error, non-ok status, unparsable body) is logged and contributes
 * no entities instead of failing the whole call.
 *
 * On top of the SDK's simplified output:
 *  - quantity claims are replaced by a single value: the bare amount for
 *    unit-less quantities, otherwise the result of `quantityToAS`;
 *  - `location` is filled from the first P625 (coordinates) claim, with
 *    `precision` renamed to `wd:precision` and a derived `accuracy`;
 *  - `location.altitude` is filled from P2044 when not already present.
 *
 * @param ids        Entity ids to fetch.
 * @param languages  Language codes requested from the API; codes without a
 *                   hyphen are lower-cased.
 * @param properties Optional map of property id to descriptor; only its
 *                   `type` is read here, to detect integer-typed quantities.
 * @returns Object mapping entity id to the simplified entity.
 */
async function fetchManyEntities(
  ids: string[],
  languages = ['en', 'de', 'fr', 'es', 'pt'],
  properties?: any
) {
  languages = languages.map((l) => l.indexOf('-') < 0 ? l.toLowerCase() : l);
  const urls = wikidata.getManyEntities({
    ids,
    languages, // returns all languages if not specified
    redirections: false, // defaults to true
  });

  const wait = (ms: number) =>
    new Promise<void>((resolve) => setTimeout(resolve, ms));

  // Best-effort fetch of one batch: never rejects, yields {} on any failure.
  const fetchBatch = async (url: string, i: number): Promise<any> => {
    // Stagger the requests (400 ms apart plus up to 800 ms of jitter).
    await wait((400 * i) + (Math.random() * 800));
    try {
      const r = await fetch(url, HEADERS.GET.JSON);
      // Stop here on an error status instead of also parsing the error body.
      if (!r.ok) return {};
      const body: any = await r.json();
      return body?.entities ?? {};
    } catch (e) {
      console.log(e);
      return {};
    }
  };

  const resArray = await Promise.all(
    urls.map((url: string, i: number) => fetchBatch(url, i)),
  );

  const res: any = {};
  for (const entities of resArray) {
    const simplified = wikidata.simplify.entities(entities);
    // Second pass keeping datatypes and rich values, needed to recognise
    // quantities and to read their unit.
    const simplifiedQuali = wikidata.simplify.entities(
      entities,
      { keepTypes: true, keepRichValues: true },
    );
    for (const key in simplified) {
      res[key] = simplified[key];
      const entity = res[key];

      for (const k in entity.claims) {
        const q = simplifiedQuali[key].claims[k];
        const isQuantity = Array.isArray(q) && q.length &&
          q[0].type === 'quantity' && typeof q[0].value === 'object';
        if (!isQuantity) continue;

        // Declared outside the try so the fallback in `catch` can see it.
        let amount: any;
        try {
          const rich = q[0].value;
          amount = rich.amount;
          const unit = rich.unit;
          if (unit === '1') {
            // Unit-less quantity: keep the bare amount.
            entity.claims[k] = amount;
          } else {
            const asQty = quantityToAS(amount, unit);
            // `properties` is optional and `type` may be a string or an array.
            const declared = properties?.[k]?.type;
            const declaredTypes: unknown[] = Array.isArray(declared)
              ? declared
              : (declared ? [declared] : []);
            const isInteger = declaredTypes.some((t) =>
              typeof t === 'string' &&
              (t === 'xsd:int' || t.indexOf('Integer') > -1)
            );
            // NOTE(review): as in the original, rounding happens after the
            // conversion and therefore only affects the fallback value below.
            // Confirm whether the converted quantity should be rounded too.
            if (isInteger) {
              amount = Math.round(amount);
            }
            entity.claims[k] = asQty.unit ? asQty : asQty.amount;
          }
        } catch (e) {
          // Conversion failed: fall back to the raw amount when it was read,
          // otherwise keep the value produced by the plain simplification.
          if (amount !== undefined) {
            entity.claims[k] = amount;
          }
        }
      }

      // Coordinates: taken from the raw (unsimplified) first P625 claim.
      const coordinateClaims = entities[key]?.claims?.P625;
      if (Array.isArray(coordinateClaims) && coordinateClaims.length) {
        const coordinate = coordinateClaims[0].mainsnak?.datavalue?.value;
        if (coordinate) {
          entity.location = coordinate;
          if (entity.location.precision) {
            const accuracy =
              100 - (Math.round(entity.location.precision * 111000 / 1000));
            entity.location.accuracy = accuracy || 90;
            entity.location['wd:precision'] = entity.location.precision;
            delete entity.location.precision;
          }
        }
      }

      // Elevation (P2044) becomes location.altitude unless already set.
      const elevation = simplified[key]?.claims?.P2044;
      if (elevation) {
        if (!entity.location) { entity.location = {}; }
        if (!entity.location.altitude) {
          entity.location.altitude =
            Array.isArray(elevation) && elevation.length
              ? elevation[0]
              : elevation;
        }
      }
    }
  }
  return res;
}
/**
 * Builds the upload.wikimedia.org URL for a Commons file name.
 *
 * Whitespace in the name is replaced by underscores; the path is
 * `<c1>/<c1><c2>/<name>` where c1 and c2 are the first two characters of the
 * MD5 digest string of the underscored name.
 */
function getWikiImgSrc(filename: string) {
  const underscored = filename.replace(/\s/g, '_');
  const digest = createHash('md5').update(underscored).toString();
  const first = digest.charAt(0);
  const second = digest.charAt(1);
  const shardPath = `${first}/${first}${second}`;
  return `https://upload.wikimedia.org/wikipedia/commons/${shardPath}/${underscored}`;
}
// File extension (lower case) to registered media type, for the file kinds
// handled by createWikiMedia.
const wikiMediaTypes: { [extension: string]: string } = {
  svg: 'image/svg+xml',
  png: 'image/png',
  jpg: 'image/jpeg',
  jpeg: 'image/jpeg',
  gif: 'image/gif',
  webp: 'image/webp',
  tif: 'image/tiff',
  tiff: 'image/tiff',
  oga: 'audio/ogg',
  mp3: 'audio/mpeg',
  flac: 'audio/flac',
  ogv: 'video/ogg',
  webm: 'video/webm',
};

/**
 * Wraps a Commons file name into an ActivityStreams-style media object.
 *
 * @param src     Commons file name (not a URL); it is converted to the
 *                upload.wikimedia.org URL via `getWikiImgSrc`.
 * @param context Optional context IRI(s) stored on the link as `context`
 *                (always emitted as an array).
 * @returns `{ type: 'Image', url: [link] }`, or `{}` when `src` is empty or
 *          not a string. The link carries `mediaType` when the extension is
 *          known, and `width`/`height` (as strings) when the file name ends
 *          in `<w>x<h>.<ext>`.
 */
export function createWikiMedia(src: string, context?: string | string[]) {
  if (!src || typeof src !== 'string') {
    return {};
  }
  src = getWikiImgSrc(src);
  const link: any = {
    type: 'Link',
    href: src,
  };
  if (context) {
    link.context = Array.isArray(context) ? context : [context];
  }

  // Extensions may contain digits (e.g. "mp3"), so match [a-z0-9].
  const sizeMatch = src.match(/(\d{2,3})x(\d{2,3})[.]([a-z0-9]*)([?]|$)/i);
  const extensionMatch = src.match(/[.]([a-z0-9]+)([?]|$)/i);

  const extension = extensionMatch ? extensionMatch[1].toLowerCase() : '';
  // Own-property check so names like "constructor" never resolve to a type.
  if (Object.prototype.hasOwnProperty.call(wikiMediaTypes, extension)) {
    link.mediaType = wikiMediaTypes[extension];
  }
  if (sizeMatch) {
    // NOTE(review): kept as strings for backward compatibility; confirm
    // whether consumers expect numeric width/height.
    link.width = sizeMatch[1];
    link.height = sizeMatch[2];
  }
  return {
    type: 'Image',
    url: [link],
  };
}
/**
 * Collects term definitions from one or more JSON-LD style contexts.
 *
 * Accepts a single context, an array of contexts, or arbitrarily nested
 * arrays. For each context object the value of its `@context` key is used
 * when present, otherwise the object itself. Every term whose definition is
 * an object with a truthy `@id` yields one descriptor, e.g.
 *   shortName: { '@id': 'wdt:P1813', '@type': ['owl:FunctionalProperty'] }
 * Terms defined as plain strings or `null`, and contexts that are `null` or
 * not objects, are skipped.
 *
 * @param contexts Context, array of contexts, or nested arrays of contexts.
 * @param result   Array the descriptors are appended to (also returned).
 * @returns The `result` array holding `{ name, property, type, container,
 *          functional, prefix, suffix }` descriptors.
 */
export function getPropertiesFromContext(
  contexts: any,
  result: WikiPropertyVar[] = [],
) {
  if (!Array.isArray(contexts)) contexts = [contexts];
  return contexts.reduce((res: any, c: any) => {
    if (Array.isArray(c)) {
      return getPropertiesFromContext(c, res);
    }
    // `typeof null` is 'object', so null must be excluded explicitly.
    if (c === null || typeof c !== 'object') {
      return res;
    }
    const o = c['@context'] ? c['@context'] : c;
    for (const name in o) {
      const definition = o[name];
      // JSON-LD allows `"term": null`; reading '@id' from it would throw.
      if (definition === null || definition === undefined) continue;
      const property = definition['@id'];
      if (!property) continue;

      const type = definition['@type'];
      const container = definition['@container'];
      const functional = (Array.isArray(type) ? type : [type]).some((s) =>
        typeof s === 'string' && s.indexOf('FunctionalProperty') > -1
      );
      res.push({
        name,
        property,
        type,
        container,
        functional,
        prefix: definition.prefix || '',
        suffix: definition.suffix || '',
      });
    }
    return res;
  }, result);
}
// TODO from cache
// Lookup table of the context terms that point at direct Wikidata properties,
// keyed by the bare property id (the `wdt:` / prop/direct prefix is stripped by
// simpleWDmap; terms without such a prefix are dropped by simpleWDfilter).
// Built once at module load from Redaktor.$context.
const properties = getPropertiesFromContext(
// TODO :
/*[WikidataSPARQL,Redaktor.$context],*/
Redaktor.$context
).map(simpleWDmap).filter(simpleWDfilter).reduce(simpleWDreduce, {});
// console.log(properties);
// Matches a Wikidata item id ("Q" followed by digits only).
const wikiIdRegex = /^Q\d+$/;
/**
 * Fetches Wikidata entities and converts each into an ActivityStreams-style
 * object (names, summaries, links, media, tags, location, bbox and every
 * claim that has a matching term in the module-level `properties` table).
 *
 * @param ids             Wikidata entity ids to fetch.
 * @param type            Types every result starts with; the entity's P31
 *                        values are appended.
 * @param languages       Language codes used for labels and sitelinks; codes
 *                        without a hyphen are lower-cased.
 * @param filepath        When set, each result is also written synchronously
 *                        to `${filepath}/${wdId}.json`.
 * @param countryCheckMap Optional map of Wikidata id to ISO code. Its first
 *                        key is compared against the entity's P17 values and
 *                        a mismatch is logged. It is also merged into the
 *                        link map below.
 * @param idMap           Optional map of Wikidata id to ISO code; item ids
 *                        found in it are linked as `/wiki/<code>` instead of
 *                        `wd:<id>`.
 * @returns One result object per entity that has claims.
 */
export async function wikiDetails(
  ids: string[],
  type: string[] = [],
  languages = ['en', 'de', 'fr', 'es', 'pt', 'ar'],
  filepath: false | string = false,
  countryCheckMap?: {[wdId: string]: string /* ISO */},
  idMap?: {[wdId: string]: string /* ISO */},
) {
  // Local, always-defined merge of both maps (idMap wins on conflicts).
  const linkMap: {[wdId: string]: string} = {...countryCheckMap, ...idMap};
  languages = languages.map((l) => l.indexOf('-') < 0 ? l.toLowerCase() : l);
  const wdRes = await fetchManyEntities(ids, languages, properties);
  const results: any[] = [];

  const asArray = (v: any): any[] => Array.isArray(v) ? v : [v];
  const isPlainObject = (v: any): boolean =>
    v !== null && typeof v === 'object' && !Array.isArray(v);
  const hasOwn = (o: any, k: string): boolean =>
    Object.prototype.hasOwnProperty.call(o, k);

  // Item ids become `/wiki/<code>` (when mapped) or `wd:<id>`; other scalars
  // are stringified. Objects (e.g. converted quantities) are passed through
  // untouched, since stringifying them would yield "[object Object]".
  const toLDlink = (t: any) => {
    if (isPlainObject(t)) return t;
    return t && typeof t === 'string' && wikiIdRegex.test(t)
      ? (linkMap[t] ? `/wiki/${linkMap[t]}` : `wd:${t}`)
      : `${t}`;
  };
  // Applies a scalar conversion to every entry, leaving objects untouched.
  const mapScalars = (values: any[], convert: (s: any) => any) =>
    values.map((s) => isPlainObject(s) ? s : convert(s));

  // console.log(wdRes);
  for (const wdId in wdRes) {
    const {id, location = {}, ...wd} = wdRes[wdId];
    const res: any = {
      type,
      id,
      name: [],
      updated: wd.modified || new Date().toISOString(),
      describes: [ `wd:${wdId}` ],
      nameMap: wd.labels ? wd.labels : {},
      alternativeNameMap: wd.aliases ? wd.aliases : {},
      summaryMap: wd.descriptions ? wd.descriptions : {},
      location
    };
    if (!wd?.claims) {
      console.log('not found', wdId);
      continue;
    }
    if (
      countryCheckMap && wd.claims.P17 && Array.isArray(wd.claims.P17) &&
      wd.claims.P17.indexOf(Object.keys(countryCheckMap)[0]) < 0
    ) {
      console.log('Country error for:');
      console.log('"'+id+'":["'+wdId+'", '+JSON.stringify(wd.claims.P17)+'],');
    }
    // TODO P740 eventLocation to location and // P706 located in/on physical feature
    // TODO P580 startTime and P582 endTime and P585 point in time / duration schema
    // 'schema:startDate', 'schema:endDate', 'schema:previousStartDate',
    if (wd.claims.P1705) { // native name
      res.name = res.name.concat(asArray(wd.claims.P1705));
    }
    if (wd.claims.P1448) { // official name
      res.name = res.name.concat(asArray(wd.claims.P1448));
    }
    if (wd.claims.P31) {
      res.type = res.type.concat(asArray(wd.claims.P31).map(toLDlink));
    }
    res.url = [
      {
        "type": "Link",
        "rel": "about",
        "href": `https://www.wikidata.org/wiki/Special:EntityData/${wdId}`,
        "nameMap": {
          "en": "details from wikidata",
          "fr": "détails de wikidata",
          "de": "Details von wikidata",
          "es": "detalles de wikidata",
          "pt": "pormenores da wikidata"
        },
        "mediaType": "application/ld+json"
      },
      {
        "type": "Link",
        "rel": "about",
        "href": `https://www.wikidata.org/entity/${wdId}`,
        "name": "wikidata",
        "mediaType": "text/html"
      }
    ];
    // TODO static GN IDs from mapping
    if (wd.claims.P1566) {
      asArray(wd.claims.P1566).forEach((gnId: string) => {
        res.url.push({
          "type": "Link",
          "rel": "about",
          "href": `https://www.geonames.org/${gnId}`,
          "name": "geonames",
          "mediaType": "text/html"
        });
      });
    }
    // TODO static OSM IDs from mapping
    for (const [osmProperty, osmKind] of [['P402','relation'], ['P11693','node'], ['P10689','way']]) {
      if (wd.claims[osmProperty]) {
        asArray(wd.claims[osmProperty]).forEach((osmId: string) => {
          res.url.push({
            "type": "Link",
            "rel": "about",
            "href": `https://openstreetmap.org/${osmKind}/${osmId}`,
            "name": "OpenStreetMap",
            "mediaType": "text/html"
          });
        });
      }
    }
    if (wd.sitelinks) {
      languages.forEach((lang) => {
        // Sitelink keys look like "<lang>wiki", "<lang>wikiquote",
        // "<lang>wiktionary", ... with hyphens written as underscores.
        // Requiring the "wik" part keeps "ar" from matching "arzwiki".
        const sitePrefix = `${lang.toLowerCase().replace(/-/g, '_')}wik`;
        for (const key in wd.sitelinks) {
          if (key.indexOf(sitePrefix) !== 0) continue;
          try {
            const title = wd.sitelinks[key];
            const href = wikidata.getSitelinkUrl(key, title);
            res.url.push({
              type: 'Link',
              name: title,
              hreflang: lang,
              href,
              mediaType: 'text/html',
            });
          } catch (e) {
            // Deliberately best-effort: a sitelink the SDK cannot resolve
            // is skipped.
          }
        }
      });
    }
    // short name (P1813) and hashtag (P2572) both become Hashtag tags
    for (const tagProperty of ['P1813', 'P2572']) {
      if (!wd.claims[tagProperty]) continue;
      wd.claims[tagProperty] = asArray(wd.claims[tagProperty]);
      if (!res.tag) { res.tag = [] }
      res.tag = res.tag.concat(
        wd.claims[tagProperty].map((name: string) => ({
          type: 'Hashtag',
          name,
        })),
      );
    }

    // Computed once; assigned inside the loop at the same point as before so
    // the order of keys in the result is unchanged.
    const country = wd.claims.P17
      ? asArray(wd.claims.P17).map(toLDlink)
      : undefined;

    for (const key in wd.claims) {
      wd.claims[key] = asArray(wd.claims[key]);
      let isInContainer = false;

      // Media properties -> icon / image / attachment
      for (const container in wdPropertyToAS) {
        if (!hasOwn(wdPropertyToAS[container], key)) continue;
        const mediaType = wdPropertyToAS[container][key];
        wd.claims[key].forEach((value: any) => {
          if (!res[container]) { res[container] = [] }
          res[container].push({
            ...createWikiMedia(
              `${value}`,
              `https://www.wikidata.org/prop/direct/${key}`,
            ),
            type: mediaType,
          });
        });
        isInContainer = true;
      }

      // Link properties -> url
      if (hasOwn(wdUrlPropertyToAS, key)) {
        const {type: linkType, rel, name: linkName, prefix: linkPrefix} =
          wdUrlPropertyToAS[key];
        wd.claims[key].forEach((value: any) => {
          // Same id-to-link rules as everywhere else (mapped ids first).
          const href = linkPrefix ? `${linkPrefix}${value}` : toLDlink(value);
          res.url.push({
            type: linkType,
            name: linkName,
            rel,
            href,
            mediaType: wdUrlPropertyToAS[key].mediaType || 'text/html'
          });
        });
        isInContainer = true;
      }
      if (isInContainer) { continue }

      if (country) {
        res.country = country;
      }
      if (
        // tags
        key === 'P1813' || key === 'P2572' ||
        // coordinates (to bbox)
        key === 'P1332' || key === 'P1333' ||
        key === 'P1334' || key === 'P1335' ||
        !properties[key]
      ) {
        continue;
      }
      const {
        name,
        functional,
        container = [],
        type = [],
        prefix: p = '',
        suffix = '',
      } = properties[key];
      const isSet = container && container.indexOf('@set') > -1;
      const isId = container && container.indexOf('@id') > -1;
      const prefix = (!p && isId)
        ? 'https://www.wikidata.org/wiki/'
        : p;
      wd.claims[key].forEach((value: any) => {
        try {
          if (!functional) {
            // grouped
            if (!Array.isArray(value)) value = [value];
            const target = isSet ? Array.from(new Set(value)) : value;
            if (!res[name]) {
              res[name] = [];
            }
            res[name] = res[name].concat(target.map(toLDlink));
          } else {
            // functional: a single value, the last claim wins
            res[name] = asArray(value).map(toLDlink)[0];
          }
        } catch(e) {
          console.log(key, name);
        }
      });
      if (res[name]) {
        if (type.indexOf('xsd:decimal') > -1) {
          if (Array.isArray(res[name])) {
            res[name] = mapScalars(res[name], (s) => !s ? 0 : parseFloat(s));
          } else if (typeof res[name] === 'string') {
            res[name] = parseFloat(res[name]);
          }
        } else if (type.indexOf('xsd:nonNegativeInteger') > -1) {
          if (Array.isArray(res[name])) {
            res[name] = mapScalars(res[name], (s) => !s ? 0 : parseInt(s, 10));
          } else if (typeof res[name] === 'string') {
            res[name] = parseInt(res[name], 10);
          }
        } else if (type.indexOf('xsd:positiveInteger') > -1) {
          if (Array.isArray(res[name])) {
            res[name] = mapScalars(res[name], (s) => parseInt(s, 10)||null);
          } else if (typeof res[name] === 'string') {
            res[name] = parseInt(res[name], 10)||null;
          }
        } else if (prefix || suffix) {
          if (Array.isArray(res[name])) {
            res[name] = mapScalars(res[name], (s) =>
              !s ? '' : `${prefix}${s}${suffix}`
            );
          } else if (!isPlainObject(res[name])) {
            res[name] = `${prefix}${res[name]}${suffix}`;
          }
        }
      }
    }
    if (
      !res.bbox && wd.claims.P1332 && wd.claims.P1333 &&
      wd.claims.P1334 && wd.claims.P1335
    ) {
      // N, S, E, W extreme points; simplified coordinates are [lat, lon].
      const { P1332, P1333, P1334, P1335 } = wd.claims;
      if (P1332.length && P1333.length && P1334.length && P1335.length) {
        const latitudeOf = (c: any) => typeof c === 'number' ? c : c[0];
        // East/west bounds are longitudes, i.e. the second coordinate entry.
        const longitudeOf = (c: any) => typeof c === 'number' ? c : c[1];
        res.bbox = {
          north: latitudeOf(P1332[0]),
          south: latitudeOf(P1333[0]),
          east: longitudeOf(P1334[0]),
          west: longitudeOf(P1335[0])
        };
      }
    }
    if (filepath) {
      Deno.writeTextFileSync(`${filepath}/${wdId}.json`, JSON.stringify(res, null, 2));
    }
    results.push(res);
  }
  return results;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment