Created
June 28, 2025 12:23
-
-
Save dridk/078e6f756603e48be467a9922b26a6d0 to your computer and use it in GitHub Desktop.
Extraction des codes ADICAP depuis un fichier RDF vers un fichier Parquet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract ADICAP codes from an RDF file and write them to a parquet file.
import rdflib
import polars

# Load the terminology graph.
g = rdflib.Graph()
g.parse("adicap.rdf")

# SPARQL query selecting the variables of interest. Each result row pairs a
# concept with the notation of ONE of its superclasses (?path), so a concept
# appears on as many rows as it has matching superclasses.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX adicap: <https://data.esante.gouv.fr/adicap/>
SELECT ?concept ?code ?label ?dict ?path
WHERE {
?concept rdfs:subClassOf* adicap:ADICAP .
?concept rdfs:label ?label.
?concept skos:notation ?code.
?concept rdfs:subClassOf+ ?superClass.
?concept adicap:dictionaryCode ?dict.
?superClass skos:notation ?path.
}
"""

# Run the SPARQL query (the query text lives in `query`).
records = g.query(query)

# Build a polars DataFrame from the result rows.
columns = [str(i) for i in records.vars]
# DataFrame column names are the upper-cased SPARQL variable names.
header = [name.upper() for name in columns]

recs = []
for rec in records:
    if not isinstance(rec, tuple):
        raise TypeError("Records must contains iterable ")
    recs.append(rec)

# Every value is stored as a string. The explicit schema keeps the expected
# columns present even when the query returns no rows, so the group_by below
# does not fail on a column-less frame.
df = polars.DataFrame(
    [dict(zip(header, map(str, rec))) for rec in recs],
    schema={name: polars.Utf8 for name in header},
)

# Collapse to one row per concept. CODE, LABEL and DICT are taken from the
# first row of each group; PATH becomes the list of all superclass notations
# of the group, in reverse of the order in which the rows were returned.
df = df.group_by("CONCEPT").agg(
    polars.col("CODE").first(),
    polars.col("LABEL").first(),
    polars.col("PATH").reverse(),
    # The query projects ?dict (there is no ?type variable), so the column
    # to aggregate is DICT.
    polars.col("DICT").first(),
)

df.write_parquet("adicap.parquet")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment