Skip to content

Instantly share code, notes, and snippets.

@jerowe
Last active December 5, 2024 18:40
Show Gist options
  • Save jerowe/6e69055b10e070850c48b563a1b2a5e9 to your computer and use it in GitHub Desktop.
Save jerowe/6e69055b10e070850c48b563a1b2a5e9 to your computer and use it in GitHub Desktop.
Opentargets data load and query
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "6d2ab485-7cb4-43eb-9e85-ccdbe3ba57d5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[23:14:22] </span><span style=\"color: #808000; text-decoration-color: #808000\">WARNING </span> USER_AGENT environment variable not set, consider setting it to identify your <a href=\"file:///opt/conda/lib/python3.11/site-packages/langchain_community/utils/user_agent.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">user_agent.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///opt/conda/lib/python3.11/site-packages/langchain_community/utils/user_agent.py#11\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">11</span></a>\n",
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> requests. <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[23:14:22]\u001b[0m\u001b[2;36m \u001b[0m\u001b[33mWARNING \u001b[0m USER_AGENT environment variable not set, consider setting it to identify your \u001b]8;id=700003;file:///opt/conda/lib/python3.11/site-packages/langchain_community/utils/user_agent.py\u001b\\\u001b[2muser_agent.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=431449;file:///opt/conda/lib/python3.11/site-packages/langchain_community/utils/user_agent.py#11\u001b\\\u001b[2m11\u001b[0m\u001b]8;;\u001b\\\n",
"\u001b[2;36m \u001b[0m requests. \u001b[2m \u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\"\"\"Notebook setup: imports and logging configuration.\"\"\"\n",
"\n",
"# Standard library\n",
"import glob\n",
"import hashlib\n",
"import logging\n",
"import os\n",
"from io import StringIO\n",
"from typing import Any, Dict, List, Optional\n",
"\n",
"# Third-party\n",
"import boto3\n",
"import funcy\n",
"import numpy as np\n",
"import pandas as pd\n",
"import psycopg\n",
"from langchain.chains import create_retrieval_chain\n",
"from langchain.chains.combine_documents import create_stuff_documents_chain\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.retrievers.bedrock import (\n",
"    AmazonKnowledgeBasesRetriever,\n",
"    RetrievalConfig,\n",
"    VectorSearchConfig,\n",
")\n",
"from langchain_community.document_loaders import (\n",
"    CSVLoader,\n",
"    DataFrameLoader,\n",
"    Docx2txtLoader,\n",
"    PyPDFLoader,\n",
"    TextLoader,\n",
"    UnstructuredEPubLoader,\n",
"    UnstructuredExcelLoader,\n",
"    UnstructuredMarkdownLoader,\n",
"    UnstructuredRSTLoader,\n",
"    UnstructuredXMLLoader,\n",
"    WebBaseLoader,\n",
")\n",
"from langchain_community.embeddings import (\n",
"    BedrockEmbeddings,\n",
")  # to create embeddings for the documents.\n",
"from langchain_core.documents.base import Document\n",
"from langchain_postgres.vectorstores import PGVector\n",
"from langchain_text_splitters import CharacterTextSplitter\n",
"from rich.logging import RichHandler\n",
"from toolz.itertoolz import partition_all\n",
"\n",
"# Project\n",
"from aws_bedrock_utilities.models.base import BedrockBase, RAGResults\n",
"from aws_bedrock_utilities.models.pgvector_knowledgebase import BedrockPGWrapper\n",
"\n",
"# Send log records through rich's handler; the handler renders level and time,\n",
"# so the format string only carries the message itself.\n",
"FORMAT = \"%(message)s\"\n",
"logging.basicConfig(\n",
"    level=\"INFO\", format=FORMAT, datefmt=\"[%X]\", handlers=[RichHandler()]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5ba42076-1679-40a5-81ce-e2cd1006a50b",
"metadata": {},
"outputs": [],
"source": [
"from aws_bedrock_utilities.models.pgvector_knowledgebase import BedrockPGWrapper"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a3cfb3de-03a6-4637-8dfd-208c61718113",
"metadata": {},
"outputs": [],
"source": [
"os.environ['POSTGRES_USER'] = 'postgres'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0bdeeb0d-6c5c-4c1a-8191-49e0376393b1",
"metadata": {},
"outputs": [],
"source": [
"os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "95e43788-9a0e-469b-9367-1e835f230605",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Found credentials in environment variables. <a href=\"file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py#1147\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1147</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Found credentials in environment variables. \u001b]8;id=767868;file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=638063;file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py#1147\u001b\\\u001b[2m1147\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.11/site-packages/aws_bedrock_utilities/models/pgvector_knowledgebase.py:188: LangChainDeprecationWarning: The class `BedrockEmbeddings` was deprecated in LangChain 0.2.11 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-aws package and should be used instead. To use it run `pip install -U :class:`~langchain-aws` and import as `from :class:`~langchain_aws import BedrockEmbeddings``.\n",
" self.bedrock_embeddings = BedrockEmbeddings(\n"
]
}
],
"source": [
"p = BedrockPGWrapper()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ea8d4916-dd1f-4ac9-a2e5-9ca76344281f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[23:14:23] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Found credentials in environment variables. <a href=\"file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py#1147\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1147</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[23:14:23]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Found credentials in environment variables. \u001b]8;id=400520;file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=61758;file:///opt/conda/lib/python3.11/site-packages/botocore/credentials.py#1147\u001b\\\u001b[2m1147\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Collection name handed to the PGVector store.\n",
"COLLECTION_NAME = \"opentargets\"\n",
"\n",
"# Bedrock embedding client configured with the Titan text embedding model id.\n",
"embeddings = BedrockEmbeddings(model_id=\"amazon.titan-embed-text-v1\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "723f54b9-a3fa-4e3d-b9b7-aa56635f41cd",
"metadata": {},
"outputs": [],
"source": [
"from urllib.parse import quote\n",
"\n",
"# The SQLAlchemy URL uses the psycopg (v3) dialect, matching the `psycopg`\n",
"# module used for the direct connection below -- not psycopg2.\n",
"driver = \"psycopg\"\n",
"user = os.environ.get(\"POSTGRES_USER\", \"postgres\")\n",
"password = os.environ.get(\"POSTGRES_PASSWORD\")\n",
"host = os.environ.get(\"POSTGRES_HOST\")\n",
"port = os.environ.get(\"POSTGRES_PORT\")\n",
"database = os.environ.get(\"POSTGRES_DB\")\n",
"\n",
"# Fail early with a readable message instead of building a URL that contains\n",
"# the literal text \"None\" for an unset variable.\n",
"required = {\"POSTGRES_HOST\": host, \"POSTGRES_PORT\": port, \"POSTGRES_DB\": database}\n",
"missing = [name for name, value in required.items() if value is None]\n",
"if missing:\n",
"    raise RuntimeError(\n",
"        \"Missing required environment variables: \" + \", \".join(missing)\n",
"    )\n",
"\n",
"# Percent-encode the credentials so characters such as '@', ':' or '/' cannot\n",
"# corrupt the URL; leave the password out entirely when it is not set.\n",
"credentials = quote(user, safe=\"\")\n",
"if password is not None:\n",
"    credentials += \":\" + quote(password, safe=\"\")\n",
"location = f\"{credentials}@{host}:{port}/{database}\"\n",
"\n",
"# SQLAlchemy-style URL consumed by PGVector.\n",
"connection = f\"postgresql+{driver}://{location}\"\n",
"\n",
"# Establish the connection to the database\n",
"conn = psycopg.connect(conninfo=f\"postgresql://{location}\")\n",
"cursor = conn.cursor()\n",
"\n",
"vectorstore = PGVector(\n",
"    embeddings=embeddings,\n",
"    collection_name=COLLECTION_NAME,\n",
"    connection=connection,\n",
"    use_jsonb=True,\n",
")\n",
"\n",
"additional_metadata = {\"dataset\": \"Opentargets Targets\"}\n",
"page_content_column = \"id\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8b0575eb-57fa-4d1a-8f27-feb6aa2024ff",
"metadata": {},
"outputs": [],
"source": [
"# glob returns matches in arbitrary order; sort so files[0] is the same file on every run.\n",
"files = sorted(glob.glob(\"/home/jovyan/data/opentargets/diseases/*parquet\"))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "cb2cda9a-680f-413a-a056-19ff8e6dadb7",
"metadata": {},
"outputs": [],
"source": [
"# Give a clear error when the glob matched nothing instead of a bare IndexError.\n",
"if not files:\n",
"    raise FileNotFoundError(\"No parquet files matched; check the glob pattern that builds `files`.\")\n",
"\n",
"# Only the first matched file is loaded.\n",
"df = pd.read_parquet(files[0])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c1db85d6-0277-4823-b9a2-713ad9c5049b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>code</th>\n",
" <th>dbXRefs</th>\n",
" <th>description</th>\n",
" <th>name</th>\n",
" <th>directLocationIds</th>\n",
" <th>obsoleteTerms</th>\n",
" <th>parents</th>\n",
" <th>synonyms</th>\n",
" <th>ancestors</th>\n",
" <th>descendants</th>\n",
" <th>children</th>\n",
" <th>therapeuticAreas</th>\n",
" <th>indirectLocationIds</th>\n",
" <th>ontology</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>EFO_0001056</td>\n",
" <td>http://www.ebi.ac.uk/efo/EFO_0001056</td>\n",
" <td>[UMLS:C0023351, ICD9:030.1, DOID:1025, SCTID:7...</td>\n",
" <td>A principal or polar form of leprosy in which ...</td>\n",
" <td>tuberculoid leprosy</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>[EFO_0001054]</td>\n",
" <td>{'hasBroadSynonym': None, 'hasExactSynonym': [...</td>\n",
" <td>[EFO_0009387, OTAR_0000017, MONDO_0020590, EFO...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[OTAR_0000017, EFO_0005741, EFO_0000618]</td>\n",
" <td>None</td>\n",
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>EFO_0003830</td>\n",
" <td>http://www.ebi.ac.uk/efo/EFO_0003830</td>\n",
" <td>[NCIt:C26866, ICD9:601, DOID:14654, NCIT:C2686...</td>\n",
" <td>An infectious or non-infectious inflammatory p...</td>\n",
" <td>prostatitis</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>[EFO_0000771, EFO_0009602]</td>\n",
" <td>{'hasBroadSynonym': None, 'hasExactSynonym': [...</td>\n",
" <td>[EFO_0000512, OTAR_0000017, EFO_0009555, EFO_0...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[OTAR_0000017, EFO_0005741]</td>\n",
" <td>None</td>\n",
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>EFO_0005274</td>\n",
" <td>http://www.ebi.ac.uk/efo/EFO_0005274</td>\n",
" <td>[]</td>\n",
" <td>measurement of the time at which sleep begins</td>\n",
" <td>sleep time</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>[EFO_0004870]</td>\n",
" <td>None</td>\n",
" <td>[EFO_0001444, EFO_0004870]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[EFO_0001444]</td>\n",
" <td>None</td>\n",
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>EFO_0006857</td>\n",
" <td>http://www.ebi.ac.uk/efo/EFO_0006857</td>\n",
" <td>[MedDRA:10063094, UMLS:C0024534, MONDO:0005625...</td>\n",
" <td>Individuals with cerebral malaria frequently e...</td>\n",
" <td>cerebral malaria</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>[EFO_0001068, EFO_0005774, EFO_1001456]</td>\n",
" <td>{'hasBroadSynonym': None, 'hasExactSynonym': [...</td>\n",
" <td>[EFO_0005741, EFO_0005774, EFO_0001067, MONDO_...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[EFO_0005741, EFO_0001379, EFO_0000618, EFO_00...</td>\n",
" <td>None</td>\n",
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>EFO_0007619</td>\n",
" <td>http://www.ebi.ac.uk/efo/EFO_0007619</td>\n",
" <td>[]</td>\n",
" <td>quantification of suicide ideation</td>\n",
" <td>suicide ideation measurement</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>[EFO_0006882]</td>\n",
" <td>None</td>\n",
" <td>[EFO_0006882, EFO_0001444, EFO_0006848]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[EFO_0001444]</td>\n",
" <td>None</td>\n",
" <td>{'isTherapeuticArea': False, 'leaf': True, 'so...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id code \\\n",
"0 EFO_0001056 http://www.ebi.ac.uk/efo/EFO_0001056 \n",
"1 EFO_0003830 http://www.ebi.ac.uk/efo/EFO_0003830 \n",
"2 EFO_0005274 http://www.ebi.ac.uk/efo/EFO_0005274 \n",
"3 EFO_0006857 http://www.ebi.ac.uk/efo/EFO_0006857 \n",
"4 EFO_0007619 http://www.ebi.ac.uk/efo/EFO_0007619 \n",
"\n",
" dbXRefs \\\n",
"0 [UMLS:C0023351, ICD9:030.1, DOID:1025, SCTID:7... \n",
"1 [NCIt:C26866, ICD9:601, DOID:14654, NCIT:C2686... \n",
"2 [] \n",
"3 [MedDRA:10063094, UMLS:C0024534, MONDO:0005625... \n",
"4 [] \n",
"\n",
" description \\\n",
"0 A principal or polar form of leprosy in which ... \n",
"1 An infectious or non-infectious inflammatory p... \n",
"2 measurement of the time at which sleep begins \n",
"3 Individuals with cerebral malaria frequently e... \n",
"4 quantification of suicide ideation \n",
"\n",
" name directLocationIds obsoleteTerms \\\n",
"0 tuberculoid leprosy None None \n",
"1 prostatitis None None \n",
"2 sleep time None None \n",
"3 cerebral malaria None None \n",
"4 suicide ideation measurement None None \n",
"\n",
" parents \\\n",
"0 [EFO_0001054] \n",
"1 [EFO_0000771, EFO_0009602] \n",
"2 [EFO_0004870] \n",
"3 [EFO_0001068, EFO_0005774, EFO_1001456] \n",
"4 [EFO_0006882] \n",
"\n",
" synonyms \\\n",
"0 {'hasBroadSynonym': None, 'hasExactSynonym': [... \n",
"1 {'hasBroadSynonym': None, 'hasExactSynonym': [... \n",
"2 None \n",
"3 {'hasBroadSynonym': None, 'hasExactSynonym': [... \n",
"4 None \n",
"\n",
" ancestors descendants children \\\n",
"0 [EFO_0009387, OTAR_0000017, MONDO_0020590, EFO... [] [] \n",
"1 [EFO_0000512, OTAR_0000017, EFO_0009555, EFO_0... [] [] \n",
"2 [EFO_0001444, EFO_0004870] [] [] \n",
"3 [EFO_0005741, EFO_0005774, EFO_0001067, MONDO_... [] [] \n",
"4 [EFO_0006882, EFO_0001444, EFO_0006848] [] [] \n",
"\n",
" therapeuticAreas indirectLocationIds \\\n",
"0 [OTAR_0000017, EFO_0005741, EFO_0000618] None \n",
"1 [OTAR_0000017, EFO_0005741] None \n",
"2 [EFO_0001444] None \n",
"3 [EFO_0005741, EFO_0001379, EFO_0000618, EFO_00... None \n",
"4 [EFO_0001444] None \n",
"\n",
" ontology \n",
"0 {'isTherapeuticArea': False, 'leaf': True, 'so... \n",
"1 {'isTherapeuticArea': False, 'leaf': True, 'so... \n",
"2 {'isTherapeuticArea': False, 'leaf': True, 'so... \n",
"3 {'isTherapeuticArea': False, 'leaf': True, 'so... \n",
"4 {'isTherapeuticArea': False, 'leaf': True, 'so... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "6d2aaddd-5d53-46aa-8571-239871a7910d",
"metadata": {},
"outputs": [],
"source": [
"#text = text.split(\"\\n\")\n",
"#text = list(filter(len, text))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "70ef07ca-02e0-4e1d-b121-f56d33a6592b",
"metadata": {},
"outputs": [],
"source": [
"# Split on newlines, measuring chunk size with len() against a limit of 4000.\n",
"text_splitter = CharacterTextSplitter(\n",
"    is_separator_regex=False,\n",
"    separator=\"\\n\",\n",
"    length_function=len,\n",
"    chunk_size=4000,\n",
")\n",
"\n",
"# Swap NaN for None, then serialise the frame as newline-delimited JSON records.\n",
"df = df.replace(np.nan, None)\n",
"text = df.to_json(lines=True, orient=\"records\")\n",
"texts = text_splitter.create_documents([text])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97a533a1-f155-44c9-93f9-40816c209c14",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment