Last active
June 28, 2023 09:37
-
-
Save jjerphan/8c532ec65ed6e2110df0620786dcfa4f to your computer and use it in GitHub Desktop.
pyarrow to Arrow C Data API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"id": "30103ds0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# `pyarrow.cffi` (imported below) needs the `cffi` package.\n", | |
"# `%pip` (rather than `!pip`) installs into the environment of the running kernel.\n", | |
"%pip install cffi"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "47699ffe", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"import pyarrow as pa\n", | |
"\n", | |
"from pyarrow.cffi import ffi\n", | |
"\n", | |
"NUM_COLUMNS = 11\n", | |
"NUM_ROWS = 7\n", | |
"SEED = 0\n", | |
"\n", | |
"# Seeded generator so that Restart & Run All reproduces the same values.\n", | |
"rng = np.random.default_rng(SEED)\n", | |
"\n", | |
"# Small frame of random integers in [0, 100), stored as pyarrow-backed float32 columns.\n", | |
"df = pd.DataFrame(\n", | |
"    rng.integers(0, 100, size=(NUM_ROWS, NUM_COLUMNS)),\n", | |
"    columns=[f\"COL_{i}\" for i in range(NUM_COLUMNS)],\n", | |
"    index=pd.date_range('2000', periods=NUM_ROWS, freq='h'),\n", | |
"    dtype=\"float32[pyarrow]\",\n", | |
")"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "b1e0ea40", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Pandas extension arrays, each one wrapping pyarrow data for one column\n", | |
"arrow_extension_arrays = df._mgr.arrays\n", | |
"\n", | |
"# `combine_chunks` is needed because `pyarrow.lib.ChunkedArray` has (for now) no way to\n", | |
"# export or import data (e.g. with `_import_from_c`/`_export_to_c`).\n", | |
"#\n", | |
"# The result is likely a list of `pyarrow.lib.FloatArray` (a subclass of `pyarrow.lib.Array`).\n", | |
"# Depending on the dtype, other `pyarrow.lib.*Array` subclasses of `pyarrow.lib.Array` may appear.\n", | |
"pyarrow_arrays = [\n", | |
"    extension_array._data.combine_chunks()\n", | |
"    for extension_array in arrow_extension_arrays\n", | |
"]"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "e625ffd3", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[<pyarrow.lib.FloatArray object at 0x7fd645c57e80>\n", | |
" [\n", | |
" 81,\n", | |
" 14,\n", | |
" 11,\n", | |
" 98,\n", | |
" 27,\n", | |
" 39,\n", | |
" 22\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd645c57fa0>\n", | |
" [\n", | |
" 38,\n", | |
" 67,\n", | |
" 11,\n", | |
" 30,\n", | |
" 30,\n", | |
" 61,\n", | |
" 45\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd645ca4040>\n", | |
" [\n", | |
" 67,\n", | |
" 46,\n", | |
" 21,\n", | |
" 59,\n", | |
" 88,\n", | |
" 79,\n", | |
" 41\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd645ca40a0>\n", | |
" [\n", | |
" 9,\n", | |
" 6,\n", | |
" 7,\n", | |
" 87,\n", | |
" 93,\n", | |
" 69,\n", | |
" 9\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd6d651b700>\n", | |
" [\n", | |
" 21,\n", | |
" 46,\n", | |
" 90,\n", | |
" 76,\n", | |
" 4,\n", | |
" 12,\n", | |
" 80\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd645ca4160>\n", | |
" [\n", | |
" 99,\n", | |
" 36,\n", | |
" 20,\n", | |
" 88,\n", | |
" 2,\n", | |
" 89,\n", | |
" 80\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd645ca4220>\n", | |
" [\n", | |
" 4,\n", | |
" 97,\n", | |
" 0,\n", | |
" 68,\n", | |
" 57,\n", | |
" 77,\n", | |
" 85\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd645ca4280>\n", | |
" [\n", | |
" 56,\n", | |
" 64,\n", | |
" 96,\n", | |
" 9,\n", | |
" 55,\n", | |
" 87,\n", | |
" 33\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd645ca42e0>\n", | |
" [\n", | |
" 92,\n", | |
" 79,\n", | |
" 83,\n", | |
" 8,\n", | |
" 35,\n", | |
" 47,\n", | |
" 90\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd645ca4340>\n", | |
" [\n", | |
" 97,\n", | |
" 94,\n", | |
" 37,\n", | |
" 94,\n", | |
" 34,\n", | |
" 56,\n", | |
" 24\n", | |
" ],\n", | |
" <pyarrow.lib.FloatArray object at 0x7fd645ca43a0>\n", | |
" [\n", | |
" 60,\n", | |
" 45,\n", | |
" 41,\n", | |
" 16,\n", | |
" 22,\n", | |
" 18,\n", | |
" 46\n", | |
" ]]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pyarrow_arrays" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "efa0aed2", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"first_pyarrow_array = pyarrow_arrays[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "ccd49550", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Help on built-in function _export_to_c:\n", | |
"\n", | |
"_export_to_c(...) method of pyarrow.lib.DataType instance\n", | |
" DataType._export_to_c(self, out_ptr)\n", | |
" \n", | |
" Export to a C ArrowSchema struct, given its pointer.\n", | |
" \n", | |
" Be careful: if you don't pass the ArrowSchema struct to a consumer,\n", | |
" its memory will leak. This is a low-level function intended for\n", | |
" expert users.\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"help(first_pyarrow_array.type._export_to_c)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "43049565", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Help on built-in function _export_to_c:\n", | |
"\n", | |
"_export_to_c(...) method of pyarrow.lib.FloatArray instance\n", | |
" Array._export_to_c(self, out_ptr, out_schema_ptr=0)\n", | |
" \n", | |
" Export to a C ArrowArray struct, given its pointer.\n", | |
" \n", | |
" If a C ArrowSchema struct pointer is also given, the array type\n", | |
" is exported to it at the same time.\n", | |
" \n", | |
" Parameters\n", | |
" ----------\n", | |
" out_ptr: int\n", | |
" The raw pointer to a C ArrowArray struct.\n", | |
" out_schema_ptr: int (optional)\n", | |
" The raw pointer to a C ArrowSchema struct.\n", | |
" \n", | |
" Be careful: if you don't pass the ArrowArray struct to a consumer,\n", | |
" array memory will leak. This is a low-level function intended for\n", | |
" expert users.\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"help(first_pyarrow_array._export_to_c)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "b061cb6f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Allocate the two Arrow C Data Interface structs with cffi; they are usable from C and C++.\n", | |
"# See: https://cffi.readthedocs.io/en/latest/using.html#working-with-pointers-structures-and-arrays\n", | |
"#\n", | |
"# Keep `c_schema` and `c_array` referenced for as long as the addresses are in use:\n", | |
"# cffi releases memory obtained from `ffi.new` once the owning object is collected.\n", | |
"c_schema = ffi.new(\"struct ArrowSchema*\")\n", | |
"c_array = ffi.new(\"struct ArrowArray*\")\n", | |
"\n", | |
"# Raw addresses as Python ints, the form expected by `_export_to_c` / `_import_from_c`\n", | |
"c_schema_ptr = int(ffi.cast(\"uintptr_t\", c_schema))\n", | |
"c_array_ptr = int(ffi.cast(\"uintptr_t\", c_array))"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "ee41451a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Export: fill the two C structs from the array's type and from the array itself\n", | |
"first_pyarrow_array.type._export_to_c(c_schema_ptr)\n", | |
"first_pyarrow_array._export_to_c(c_array_ptr)\n", | |
"\n", | |
"# Import: rebuild the data type first, then the array that uses it\n", | |
"deserialized_schema = pa.DataType._import_from_c(c_schema_ptr)\n", | |
"deserialized_first_pyarrow_array = pa.Array._import_from_c(\n", | |
"    c_array_ptr, deserialized_schema\n", | |
")\n", | |
"\n", | |
"# The round trip through the C structs must give back an equal array\n", | |
"assert deserialized_first_pyarrow_array.equals(first_pyarrow_array)"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "cb4f412c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment