Created
June 28, 2025 12:19
-
-
Save dridk/2cb015792bcf7db82cc7f9eb6f00f2fc to your computer and use it in GitHub Desktop.
Extraction des codes CCAM depuis un fichier RDF vers un fichier Parquet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import rdflib
import polars as pl

# Load the CCAM terminology from the local RDF file.
g = rdflib.Graph()
g.parse("ccam.rdf")

# SPARQL query retrieving the variables of interest for every concept that is
# a (transitive) subclass of ccam:Acte.
# - One result row is produced per (concept, ancestor notation, synonym)
#   combination, so ?path and ?synonyms are regrouped into lists below.
# - ?action is matched but not selected: the pattern only restricts the result
#   to concepts that have a labelled ccam:action.
# - Only skos:altLabel is OPTIONAL; concepts missing any other property are
#   excluded from the result.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX ccam: <http://data.esante.gouv.fr/cnam/ccam/>
SELECT ?concept ?code ?label ?path ?synonyms ?topographie ?type_acte ?mode_acces
WHERE {
    ?concept rdfs:subClassOf* ccam:Acte .
    ?concept rdfs:label ?label.
    ?concept skos:notation ?code.
    ?concept rdfs:subClassOf+ ?superClass.
    ?superClass skos:notation ?path.
    ?concept ccam:topographie ?topographieConcept.
    ?topographieConcept rdfs:label ?topographie.
    ?concept ccam:typeActe ?typeActeConcept.
    ?typeActeConcept rdfs:label ?type_acte.
    ?concept ccam:modeAcces ?modeAccesConcept.
    ?modeAccesConcept rdfs:label ?mode_acces.
    ?concept ccam:action ?actionConcept.
    ?actionConcept rdfs:label ?action.
    OPTIONAL { ?concept skos:altLabel ?synonyms. }
}
"""

# Run the SPARQL query against the loaded graph.
records = g.query(query)

# Build a polars DataFrame whose columns are the upper-cased SELECT variables.
columns = [str(var).upper() for var in records.vars]

rows = []
for rec in records:
    if not isinstance(rec, tuple):
        raise TypeError("Records must contains iterable ")
    # Unbound OPTIONAL variables come back as None; keep them as real nulls
    # (str(None) would yield the text "None" and defeat drop_nulls() below).
    rows.append(
        {
            name: None if value is None else str(value)
            for name, value in zip(columns, rec)
        }
    )

# An explicit schema keeps every column a string column even when the result
# is empty or when a nullable column starts with a long run of nulls.
df = pl.DataFrame(rows, schema={name: pl.Utf8 for name in columns})

# Collapse to one row per concept: scalar attributes take their first value,
# while PATH and SYNONYMS are gathered into lists.
# NOTE(review): because the query yields the cross product of ancestors and
# synonyms, both lists may contain repeated entries -- confirm whether
# de-duplication is wanted.
df = df.group_by("CONCEPT").agg(
    pl.col("CODE").first(),
    pl.col("LABEL").first(),
    pl.col("PATH").reverse(),
    pl.col("SYNONYMS").drop_nulls(),
    pl.col("TOPOGRAPHIE").first(),
    pl.col("TYPE_ACTE").first(),
    pl.col("MODE_ACCES").first(),
)

df.write_parquet("ccam.parquet")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment