Last active
December 5, 2024 18:40
-
-
Save jerowe/6e69055b10e070850c48b563a1b2a5e9 to your computer and use it in GitHub Desktop.
Opentargets data load and query
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "6d2ab485-7cb4-43eb-9e85-ccdbe3ba57d5", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[23:14:22] </span><span style=\"color: #808000; text-decoration-color: #808000\">WARNING </span> USER_AGENT environment variable not set, consider setting it to identify your <a href=\"file:///opt/conda/lib/python3.11/site-packages/langchain_community/utils/user_agent.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">user_agent.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///opt/conda/lib/python3.11/site-packages/langchain_community/utils/user_agent.py#11\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">11</span></a>\n", | |
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> requests. <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[2;36m[23:14:22]\u001b[0m\u001b[2;36m \u001b[0m\u001b[33mWARNING \u001b[0m USER_AGENT environment variable not set, consider setting it to identify your \u001b]8;id=700003;file:///opt/conda/lib/python3.11/site-packages/langchain_community/utils/user_agent.py\u001b\\\u001b[2muser_agent.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=431449;file:///opt/conda/lib/python3.11/site-packages/langchain_community/utils/user_agent.py#11\u001b\\\u001b[2m11\u001b[0m\u001b]8;;\u001b\\\n", | |
"\u001b[2;36m \u001b[0m requests. \u001b[2m \u001b[0m\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"\"\"\"Main module.\"\"\"\n", | |
"\n", | |
"import hashlib\n", | |
"import logging\n", | |
"import os\n", | |
"from typing import Optional, List, Dict, Any\n", | |
"import glob\n", | |
"import boto3\n", | |
"from toolz.itertoolz import partition_all\n", | |
"import pandas as pd\n", | |
"from langchain_core.documents.base import Document\n", | |
"\n", | |
"import funcy\n", | |
"import psycopg\n", | |
"from langchain_community.document_loaders import PyPDFLoader\n", | |
"from langchain_community.embeddings import (\n", | |
" BedrockEmbeddings,\n", | |
") # to create embeddings for the documents.\n", | |
"from langchain_postgres.vectorstores import PGVector\n", | |
"from langchain_text_splitters import CharacterTextSplitter\n", | |
"\n", | |
"import numpy as np\n", | |
"\n", | |
"from rich.logging import RichHandler\n", | |
"\n", | |
"from aws_bedrock_utilities.models.base import BedrockBase\n", | |
"from langchain_community.document_loaders import (\n", | |
" WebBaseLoader,\n", | |
" TextLoader,\n", | |
" PyPDFLoader,\n", | |
" CSVLoader,\n", | |
" Docx2txtLoader,\n", | |
" UnstructuredEPubLoader,\n", | |
" UnstructuredMarkdownLoader,\n", | |
" UnstructuredXMLLoader,\n", | |
" UnstructuredRSTLoader,\n", | |
" UnstructuredExcelLoader,\n", | |
" DataFrameLoader,\n", | |
")\n", | |
"from io import StringIO \n", | |
"import hashlib\n", | |
"import logging\n", | |
"import os\n", | |
"from typing import Optional, List, Dict, Any\n", | |
"import glob\n", | |
"import boto3\n", | |
"from toolz.itertoolz import partition_all\n", | |
"import pandas as pd\n", | |
"from langchain_core.documents.base import Document\n", | |
"\n", | |
"import funcy\n", | |
"import psycopg\n", | |
"from langchain_community.document_loaders import PyPDFLoader\n", | |
"from langchain_community.embeddings import (\n", | |
" BedrockEmbeddings,\n", | |
") # to create embeddings for the documents.\n", | |
"from langchain_postgres.vectorstores import PGVector\n", | |
"from langchain_text_splitters import CharacterTextSplitter\n", | |
"from rich.logging import RichHandler\n", | |
"\n", | |
"from aws_bedrock_utilities.models.base import BedrockBase\n", | |
"from langchain_community.document_loaders import (\n", | |
" WebBaseLoader,\n", | |
" TextLoader,\n", | |
" PyPDFLoader,\n", | |
" CSVLoader,\n", | |
" Docx2txtLoader,\n", | |
" UnstructuredEPubLoader,\n", | |
" UnstructuredMarkdownLoader,\n", | |
" UnstructuredXMLLoader,\n", | |
" UnstructuredRSTLoader,\n", | |
" UnstructuredExcelLoader,\n", | |
" DataFrameLoader,\n", | |
")\n", | |
"from langchain_text_splitters import CharacterTextSplitter\n", | |
"\n", | |
"\n", | |
"import logging\n", | |
"\n", | |
"from langchain.chains import create_retrieval_chain\n", | |
"from langchain.chains.combine_documents import create_stuff_documents_chain\n", | |
"from langchain.prompts import PromptTemplate\n", | |
"from langchain.retrievers.bedrock import (\n", | |
" AmazonKnowledgeBasesRetriever,\n", | |
" RetrievalConfig,\n", | |
" VectorSearchConfig,\n", | |
")\n", | |
"\n", | |
"from aws_bedrock_utilities.models.base import BedrockBase, RAGResults\n", | |
"from aws_bedrock_utilities.models.pgvector_knowledgebase import BedrockPGWrapper\n", | |
"\n", | |
"\n", | |
"# Configure the root logger to emit through Rich. The record format is only the\n", | |
"# message text, and \"[%X]\" is the time pattern passed as datefmt.\n", | |
"FORMAT = \"%(message)s\"\n", | |
"logging.basicConfig(\n", | |
"    level=\"INFO\",\n", | |
"    format=FORMAT,\n", | |
"    datefmt=\"[%X]\",\n", | |
"    handlers=[RichHandler()],\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "5ba42076-1679-40a5-81ce-e2cd1006a50b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from aws_bedrock_utilities.models.pgvector_knowledgebase import BedrockPGWrapper" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "a3cfb3de-03a6-4637-8dfd-208c61718113", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Pin the database role for this session; the connection cell reads it back\n", | |
"# from the environment.\n", | |
"os.environ[\"POSTGRES_USER\"] = \"postgres\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "0bdeeb0d-6c5c-4c1a-8191-49e0376393b1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Pin the AWS region for this session.\n", | |
"# NOTE(review): presumably picked up by the Bedrock clients created in later cells - confirm.\n", | |
"os.environ[\"AWS_DEFAULT_REGION\"] = \"us-east-1\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "95e43788-9a0e-469b-9367-1e835f230605", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Found credentials in environment variables. <a href=\"file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py#1147\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1147</span></a>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Found credentials in environment variables. \u001b]8;id=767868;file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=638063;file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py#1147\u001b\\\u001b[2m1147\u001b[0m\u001b]8;;\u001b\\\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/opt/conda/lib/python3.11/site-packages/aws_bedrock_utilities/models/pgvector_knowledgebase.py:188: LangChainDeprecationWarning: The class `BedrockEmbeddings` was deprecated in LangChain 0.2.11 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-aws package and should be used instead. To use it run `pip install -U :class:`~langchain-aws` and import as `from :class:`~langchain_aws import BedrockEmbeddings``.\n", | |
" self.bedrock_embeddings = BedrockEmbeddings(\n" | |
] | |
} | |
], | |
"source": [ | |
"# Instantiate the project's BedrockPGWrapper with its default arguments.\n", | |
"# NOTE(review): `p` is not referenced by any later cell in this notebook.\n", | |
"p = BedrockPGWrapper()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "ea8d4916-dd1f-4ac9-a2e5-9ca76344281f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[23:14:23] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Found credentials in environment variables. <a href=\"file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py#1147\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1147</span></a>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[2;36m[23:14:23]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Found credentials in environment variables. \u001b]8;id=400520;file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=61758;file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py#1147\u001b\\\u001b[2m1147\u001b[0m\u001b]8;;\u001b\\\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Collection name passed to the PGVector store built in the next cell.\n", | |
"# Alternative name left by the author: \"opentargets_targets\".\n", | |
"COLLECTION_NAME = \"opentargets\"\n", | |
"\n", | |
"# Embedding client for the Titan text model.\n", | |
"# NOTE(review): this notebook's own output shows a LangChainDeprecationWarning for\n", | |
"# BedrockEmbeddings that points to the langchain_aws package.\n", | |
"embeddings = BedrockEmbeddings(model_id=\"amazon.titan-embed-text-v1\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "723f54b9-a3fa-4e3d-b9b7-aa56635f41cd", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from urllib.parse import quote\n", | |
"\n", | |
"# Connection settings are read from the environment; only the user has a default.\n", | |
"# `driver` names psycopg 3 so that it agrees with the SQLAlchemy dialect used in\n", | |
"# `connection` and with the psycopg.connect() call below.\n", | |
"driver = \"psycopg\"\n", | |
"user = os.environ.get(\"POSTGRES_USER\", \"postgres\")\n", | |
"password = os.environ.get(\"POSTGRES_PASSWORD\")\n", | |
"host = os.environ.get(\"POSTGRES_HOST\")\n", | |
"port = os.environ.get(\"POSTGRES_PORT\")\n", | |
"database = os.environ.get(\"POSTGRES_DB\")\n", | |
"\n", | |
"# Fail early with a readable message instead of building a URL that contains\n", | |
"# the literal text \"None\" for an unset variable.\n", | |
"_required = {\n", | |
"    \"POSTGRES_PASSWORD\": password,\n", | |
"    \"POSTGRES_HOST\": host,\n", | |
"    \"POSTGRES_PORT\": port,\n", | |
"    \"POSTGRES_DB\": database,\n", | |
"}\n", | |
"_missing = [name for name, value in _required.items() if value is None]\n", | |
"if _missing:\n", | |
"    raise RuntimeError(\"Missing required environment variables: \" + \", \".join(_missing))\n", | |
"\n", | |
"# Percent-encode the credentials so characters such as '@', ':' or '/' in the\n", | |
"# password cannot be mistaken for URL delimiters.\n", | |
"_userinfo = quote(user, safe=\"\") + \":\" + quote(password, safe=\"\")\n", | |
"_location = f\"{_userinfo}@{host}:{port}/{database}\"\n", | |
"\n", | |
"# SQLAlchemy-style URL handed to PGVector.\n", | |
"connection = f\"postgresql+{driver}://{_location}\"\n", | |
"\n", | |
"# Direct psycopg connection and cursor; nothing in this notebook closes them.\n", | |
"conn = psycopg.connect(conninfo=f\"postgresql://{_location}\")\n", | |
"cursor = conn.cursor()\n", | |
"\n", | |
"vectorstore = PGVector(\n", | |
"    embeddings=embeddings,\n", | |
"    collection_name=COLLECTION_NAME,\n", | |
"    connection=connection,\n", | |
"    use_jsonb=True,\n", | |
")\n", | |
"\n", | |
"# NOTE(review): the label says \"Targets\" but the following cells load the diseases\n", | |
"# parquet files - confirm which dataset name is intended.\n", | |
"additional_metadata = {\"dataset\": \"Opentargets Targets\"}\n", | |
"page_content_column = \"id\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "8b0575eb-57fa-4d1a-8f27-feb6aa2024ff", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# glob.glob returns matches in arbitrary order; sorting makes the list, and\n", | |
"# therefore the files[0] picked in the next cell, reproducible across runs.\n", | |
"files = sorted(glob.glob(\"/home/jovyan/data/opentargets/diseases/*parquet\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "cb2cda9a-680f-413a-a056-19ff8e6dadb7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Guard against an empty match list so the failure names the real problem\n", | |
"# instead of surfacing as a bare IndexError.\n", | |
"if not files:\n", | |
"    raise FileNotFoundError(\"No parquet files were found; check the glob pattern used to build `files`.\")\n", | |
"\n", | |
"# Only the first matched file is loaded.\n", | |
"df = pd.read_parquet(files[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "c1db85d6-0277-4823-b9a2-713ad9c5049b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>code</th>\n", | |
" <th>dbXRefs</th>\n", | |
" <th>description</th>\n", | |
" <th>name</th>\n", | |
" <th>directLocationIds</th>\n", | |
" <th>obsoleteTerms</th>\n", | |
" <th>parents</th>\n", | |
" <th>synonyms</th>\n", | |
" <th>ancestors</th>\n", | |
" <th>descendants</th>\n", | |
" <th>children</th>\n", | |
" <th>therapeuticAreas</th>\n", | |
" <th>indirectLocationIds</th>\n", | |
" <th>ontology</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>EFO_0001056</td>\n", | |
" <td>http://www.ebi.ac.uk/efo/EFO_0001056</td>\n", | |
" <td>[UMLS:C0023351, ICD9:030.1, DOID:1025, SCTID:7...</td>\n", | |
" <td>A principal or polar form of leprosy in which ...</td>\n", | |
" <td>tuberculoid leprosy</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>[EFO_0001054]</td>\n", | |
" <td>{'hasBroadSynonym': None, 'hasExactSynonym': [...</td>\n", | |
" <td>[EFO_0009387, OTAR_0000017, MONDO_0020590, EFO...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[OTAR_0000017, EFO_0005741, EFO_0000618]</td>\n", | |
" <td>None</td>\n", | |
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>EFO_0003830</td>\n", | |
" <td>http://www.ebi.ac.uk/efo/EFO_0003830</td>\n", | |
" <td>[NCIt:C26866, ICD9:601, DOID:14654, NCIT:C2686...</td>\n", | |
" <td>An infectious or non-infectious inflammatory p...</td>\n", | |
" <td>prostatitis</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>[EFO_0000771, EFO_0009602]</td>\n", | |
" <td>{'hasBroadSynonym': None, 'hasExactSynonym': [...</td>\n", | |
" <td>[EFO_0000512, OTAR_0000017, EFO_0009555, EFO_0...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[OTAR_0000017, EFO_0005741]</td>\n", | |
" <td>None</td>\n", | |
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>EFO_0005274</td>\n", | |
" <td>http://www.ebi.ac.uk/efo/EFO_0005274</td>\n", | |
" <td>[]</td>\n", | |
" <td>measurement of the time at which sleep begins</td>\n", | |
" <td>sleep time</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>[EFO_0004870]</td>\n", | |
" <td>None</td>\n", | |
" <td>[EFO_0001444, EFO_0004870]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[EFO_0001444]</td>\n", | |
" <td>None</td>\n", | |
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>EFO_0006857</td>\n", | |
" <td>http://www.ebi.ac.uk/efo/EFO_0006857</td>\n", | |
" <td>[MedDRA:10063094, UMLS:C0024534, MONDO:0005625...</td>\n", | |
" <td>Individuals with cerebral malaria frequently e...</td>\n", | |
" <td>cerebral malaria</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>[EFO_0001068, EFO_0005774, EFO_1001456]</td>\n", | |
" <td>{'hasBroadSynonym': None, 'hasExactSynonym': [...</td>\n", | |
" <td>[EFO_0005741, EFO_0005774, EFO_0001067, MONDO_...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[EFO_0005741, EFO_0001379, EFO_0000618, EFO_00...</td>\n", | |
" <td>None</td>\n", | |
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>EFO_0007619</td>\n", | |
" <td>http://www.ebi.ac.uk/efo/EFO_0007619</td>\n", | |
" <td>[]</td>\n", | |
" <td>quantification of suicide ideation</td>\n", | |
" <td>suicide ideation measurement</td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" <td>[EFO_0006882]</td>\n", | |
" <td>None</td>\n", | |
" <td>[EFO_0006882, EFO_0001444, EFO_0006848]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[EFO_0001444]</td>\n", | |
" <td>None</td>\n", | |
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id code \\\n", | |
"0 EFO_0001056 http://www.ebi.ac.uk/efo/EFO_0001056 \n", | |
"1 EFO_0003830 http://www.ebi.ac.uk/efo/EFO_0003830 \n", | |
"2 EFO_0005274 http://www.ebi.ac.uk/efo/EFO_0005274 \n", | |
"3 EFO_0006857 http://www.ebi.ac.uk/efo/EFO_0006857 \n", | |
"4 EFO_0007619 http://www.ebi.ac.uk/efo/EFO_0007619 \n", | |
"\n", | |
" dbXRefs \\\n", | |
"0 [UMLS:C0023351, ICD9:030.1, DOID:1025, SCTID:7... \n", | |
"1 [NCIt:C26866, ICD9:601, DOID:14654, NCIT:C2686... \n", | |
"2 [] \n", | |
"3 [MedDRA:10063094, UMLS:C0024534, MONDO:0005625... \n", | |
"4 [] \n", | |
"\n", | |
" description \\\n", | |
"0 A principal or polar form of leprosy in which ... \n", | |
"1 An infectious or non-infectious inflammatory p... \n", | |
"2 measurement of the time at which sleep begins \n", | |
"3 Individuals with cerebral malaria frequently e... \n", | |
"4 quantification of suicide ideation \n", | |
"\n", | |
" name directLocationIds obsoleteTerms \\\n", | |
"0 tuberculoid leprosy None None \n", | |
"1 prostatitis None None \n", | |
"2 sleep time None None \n", | |
"3 cerebral malaria None None \n", | |
"4 suicide ideation measurement None None \n", | |
"\n", | |
" parents \\\n", | |
"0 [EFO_0001054] \n", | |
"1 [EFO_0000771, EFO_0009602] \n", | |
"2 [EFO_0004870] \n", | |
"3 [EFO_0001068, EFO_0005774, EFO_1001456] \n", | |
"4 [EFO_0006882] \n", | |
"\n", | |
" synonyms \\\n", | |
"0 {'hasBroadSynonym': None, 'hasExactSynonym': [... \n", | |
"1 {'hasBroadSynonym': None, 'hasExactSynonym': [... \n", | |
"2 None \n", | |
"3 {'hasBroadSynonym': None, 'hasExactSynonym': [... \n", | |
"4 None \n", | |
"\n", | |
" ancestors descendants children \\\n", | |
"0 [EFO_0009387, OTAR_0000017, MONDO_0020590, EFO... [] [] \n", | |
"1 [EFO_0000512, OTAR_0000017, EFO_0009555, EFO_0... [] [] \n", | |
"2 [EFO_0001444, EFO_0004870] [] [] \n", | |
"3 [EFO_0005741, EFO_0005774, EFO_0001067, MONDO_... [] [] \n", | |
"4 [EFO_0006882, EFO_0001444, EFO_0006848] [] [] \n", | |
"\n", | |
" therapeuticAreas indirectLocationIds \\\n", | |
"0 [OTAR_0000017, EFO_0005741, EFO_0000618] None \n", | |
"1 [OTAR_0000017, EFO_0005741] None \n", | |
"2 [EFO_0001444] None \n", | |
"3 [EFO_0005741, EFO_0001379, EFO_0000618, EFO_00... None \n", | |
"4 [EFO_0001444] None \n", | |
"\n", | |
" ontology \n", | |
"0 {'isTherapeuticArea': False, 'leaf': True, 'so... \n", | |
"1 {'isTherapeuticArea': False, 'leaf': True, 'so... \n", | |
"2 {'isTherapeuticArea': False, 'leaf': True, 'so... \n", | |
"3 {'isTherapeuticArea': False, 'leaf': True, 'so... \n", | |
"4 {'isTherapeuticArea': False, 'leaf': True, 'so... " | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Preview the first five rows to inspect the columns and nested fields.\n", | |
"df.head(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "6d2aaddd-5d53-46aa-8571-239871a7910d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#text = text.split(\"\\n\")\n", | |
"#text = list(filter(len, text))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "70ef07ca-02e0-4e1d-b121-f56d33a6592b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Splitter that breaks only on the newline separator, i.e. between JSON Lines records.\n", | |
"# NOTE(review): chunk_overlap is not passed, so the splitter's own default applies -\n", | |
"# confirm that is intended.\n", | |
"text_splitter = CharacterTextSplitter(\n", | |
"    separator=\"\\n\",\n", | |
"    chunk_size=4000,\n", | |
"    length_function=len,\n", | |
"    is_separator_regex=False,\n", | |
")\n", | |
"\n", | |
"# Swap NaN for None (this rebinds `df`), then serialise one JSON record per line.\n", | |
"df = df.replace(np.nan, None)\n", | |
"text = df.to_json(orient=\"records\", lines=True)\n", | |
"\n", | |
"# Wrap the resulting chunks in Document objects.\n", | |
"texts = text_splitter.create_documents([text])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "97a533a1-f155-44c9-93f9-40816c209c14", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment