Skip to content

Instantly share code, notes, and snippets.

@dridk
Created June 28, 2025 12:19
Show Gist options
  • Save dridk/2cb015792bcf7db82cc7f9eb6f00f2fc to your computer and use it in GitHub Desktop.
Save dridk/2cb015792bcf7db82cc7f9eb6f00f2fc to your computer and use it in GitHub Desktop.
Extraction des codes CCAM depuis un fichier RDF vers un fichier Parquet
import rdflib
import polars
# Export CCAM concepts from an RDF terminology file to a Parquet file.
#
# Reads "ccam.rdf", runs one SPARQL SELECT over it, groups the result rows
# per concept and writes the outcome to "ccam.parquet".

# Load the terminology.
g = rdflib.Graph()
g.parse("ccam.rdf")

# SPARQL query fetching the variables of interest.
# Only ?synonyms is OPTIONAL: a concept lacking any other pattern (including
# ccam:action, which is matched but not selected) yields no row at all.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX ccam: <http://data.esante.gouv.fr/cnam/ccam/>
SELECT ?concept ?code ?label ?path ?synonyms ?topographie ?type_acte ?mode_acces
WHERE {
?concept rdfs:subClassOf* ccam:Acte .
?concept rdfs:label ?label.
?concept skos:notation ?code.
?concept rdfs:subClassOf+ ?superClass.
?superClass skos:notation ?path.
?concept ccam:topographie ?topographieConcept.
?topographieConcept rdfs:label ?topographie.
?concept ccam:typeActe ?typeActeConcept.
?typeActeConcept rdfs:label ?type_acte.
?concept ccam:modeAcces ?modeAccesConcept.
?modeAccesConcept rdfs:label ?mode_acces.
?concept ccam:action ?actionConcept.
?actionConcept rdfs:label ?action.
OPTIONAL { ?concept skos:altLabel ?synonyms. }
}
"""

# Run the SPARQL query (the query text is bound to `query`, not `sparql`).
records = g.query(query)

# Build a polars DataFrame: one upper-cased column per selected variable.
columns = [str(var).upper() for var in records.vars]
rows = []
for rec in records:
    if not isinstance(rec, tuple):
        raise TypeError("Records must contains iterable ")
    # Keep unbound OPTIONAL values as real nulls; str(None) would store the
    # literal text "None" and defeat drop_nulls() further down.
    rows.append(
        {
            name: None if value is None else str(value)
            for name, value in zip(columns, rec)
        }
    )

# An explicit string schema keeps the columns defined (and typed) even when
# the query returns no row or a column holds only nulls.
df = polars.DataFrame(rows, schema={name: polars.Utf8 for name in columns})

# One output row per concept. The query returns one row per
# (ancestor, synonym) combination, so the list-valued columns are
# de-duplicated while keeping their first-seen order.
df = df.group_by("CONCEPT").agg(
    polars.col("CODE").first(),
    polars.col("LABEL").first(),
    polars.col("PATH").unique(maintain_order=True).reverse(),
    polars.col("SYNONYMS").drop_nulls().unique(maintain_order=True),
    polars.col("TOPOGRAPHIE").first(),
    polars.col("TYPE_ACTE").first(),
    polars.col("MODE_ACCES").first(),
)
df.write_parquet("ccam.parquet")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment