Skip to content

Instantly share code, notes, and snippets.

@dridk
Created June 28, 2025 12:23
Show Gist options
  • Save dridk/078e6f756603e48be467a9922b26a6d0 to your computer and use it in GitHub Desktop.
Save dridk/078e6f756603e48be467a9922b26a6d0 to your computer and use it in GitHub Desktop.
Extraction des codes ADICAP depuis un fichier RDF vers un fichier Parquet
import rdflib
import polars
# Export ADICAP concepts from an RDF file to a Parquet file.
#
# Reads "adicap.rdf", runs a SPARQL query that yields one row per
# (concept, ancestor) pair, collapses those rows to one row per concept, and
# writes the result to "adicap.parquet".

# Load the terminology.
g = rdflib.Graph()
g.parse("adicap.rdf")

# SPARQL query retrieving the variables of interest: for every concept reachable
# from adicap:ADICAP through rdfs:subClassOf (zero or more steps), its label,
# notation and dictionary code, plus the notation of each of its strict
# ancestors (one result row per ancestor, bound to ?path).
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX adicap: <https://data.esante.gouv.fr/adicap/>
SELECT ?concept ?code ?label ?dict ?path
WHERE {
?concept rdfs:subClassOf* adicap:ADICAP .
?concept rdfs:label ?label.
?concept skos:notation ?code.
?concept rdfs:subClassOf+ ?superClass.
?concept adicap:dictionaryCode ?dict.
?superClass skos:notation ?path.
}
"""

# Run the SPARQL query (the query text is bound to `query`).
records = g.query(query)

# Column names are the upper-cased SELECT variable names:
# CONCEPT, CODE, LABEL, DICT, PATH.
columns = [str(var).upper() for var in records.vars]

# Keep only tuple-like result rows; anything else cannot be mapped onto columns.
recs = []
for rec in records:
    if not isinstance(rec, tuple):
        raise TypeError("Records must contains iterable ")
    recs.append(rec)

# Build a polars DataFrame; every RDF term is stored as its string form.
df = polars.DataFrame(
    [{name: str(value) for name, value in zip(columns, rec)} for rec in recs]
)

# Collapse the (concept, ancestor) rows to one row per concept. PATH becomes the
# list of ancestor notations in reversed row order; the other columns are
# constant within a group, so the first value is kept.
df = df.group_by("CONCEPT").agg(
    polars.col("CODE").first(),
    polars.col("LABEL").first(),
    polars.col("PATH").reverse(),
    # The query projects ?dict, so the column is DICT (no TYPE column exists).
    polars.col("DICT").first(),
)

df.write_parquet("adicap.parquet")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment