Created
April 3, 2018 15:35
-
-
Save ricalanis/b663c80889746cc29cbee57e28419ded to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from selenium import webdriver\n", | |
"from selenium.webdriver.common.by import By\n", | |
"from selenium.webdriver.common.keys import Keys\n", | |
"from bs4 import BeautifulSoup\n", | |
"import requests\n", | |
"import pandas\n", | |
"import simplejson" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Start a Chrome WebDriver session and load the page at the URL below.\n", | |
"# NOTE(review): no cell shown here calls driver.quit(); confirm the browser is closed elsewhere.\n", | |
"driver = webdriver.Chrome()\n", | |
"driver.get(\"http://www.edomexico.gob.mx/locatel/htm/asp/consultaextra.asp\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"button_start = driver.find_element_by_name(\"Image1\")\n", | |
"button_start.click()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_links(driver):\n", | |
" try:\n", | |
" links = [link[\"href\"].replace(\"verextraviado2.asp?\",\"\") for link in BeautifulSoup(driver.page_source).find_all(\"table\")[2].find_all(\"a\")]\n", | |
" output_total = []\n", | |
" for link in links:\n", | |
" output = {}\n", | |
" for value in link.split(\"&\"):\n", | |
" try:\n", | |
" output[value.split(\"=\")[0]]=value.split(\"=\")[1]\n", | |
" except:\n", | |
" next\n", | |
" output_total.append(output)\n", | |
" except:\n", | |
" next\n", | |
" return output_total" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/lib/python3.6/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", | |
"\n", | |
"The code that caused this warning is on line 193 of the file /usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py. To get rid of this warning, change code that looks like this:\n", | |
"\n", | |
" BeautifulSoup([your markup])\n", | |
"\n", | |
"to this:\n", | |
"\n", | |
" BeautifulSoup([your markup], \"lxml\")\n", | |
"\n", | |
" markup_type=markup_type))\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1\n", | |
"2\n", | |
"3\n", | |
"4\n", | |
"5\n", | |
"6\n", | |
"7\n", | |
"8\n", | |
"9\n", | |
"10\n", | |
"11\n", | |
"12\n", | |
"13\n", | |
"14\n", | |
"15\n", | |
"16\n", | |
"17\n", | |
"18\n", | |
"19\n", | |
"20\n", | |
"21\n", | |
"22\n", | |
"23\n", | |
"24\n", | |
"25\n", | |
"26\n", | |
"27\n", | |
"28\n", | |
"29\n", | |
"30\n", | |
"31\n", | |
"32\n", | |
"33\n", | |
"34\n", | |
"35\n", | |
"36\n", | |
"37\n", | |
"38\n", | |
"39\n", | |
"40\n", | |
"41\n", | |
"42\n", | |
"43\n", | |
"44\n", | |
"45\n", | |
"46\n", | |
"47\n", | |
"48\n", | |
"49\n", | |
"50\n", | |
"51\n", | |
"52\n", | |
"53\n", | |
"54\n", | |
"55\n", | |
"56\n", | |
"57\n", | |
"58\n", | |
"59\n", | |
"60\n", | |
"61\n", | |
"62\n", | |
"63\n", | |
"64\n", | |
"65\n", | |
"66\n", | |
"67\n", | |
"68\n", | |
"69\n", | |
"70\n" | |
] | |
} | |
], | |
"source": [ | |
"output = []\n", | |
"for i in range(1,71):\n", | |
" driver.get(\"http://www.edomexico.gob.mx/locatel/htm/asp/vernomextra.asp?pagina=\"+str(i))\n", | |
" output = output + get_links(driver)\n", | |
" print(str(i))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/lib/python3.6/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", | |
"\n", | |
"The code that caused this warning is on line 193 of the file /usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py. To get rid of this warning, change code that looks like this:\n", | |
"\n", | |
" BeautifulSoup([your markup])\n", | |
"\n", | |
"to this:\n", | |
"\n", | |
" BeautifulSoup([your markup], \"lxml\")\n", | |
"\n", | |
" markup_type=markup_type))\n" | |
] | |
} | |
], | |
"source": [ | |
"# One-off call on whatever page the driver currently shows.\n", | |
"# NOTE(review): this cell's execution count (57) is lower than its neighbours (75, 83)\n", | |
"# and no other cell shown here reads data; looks like a leftover experiment, confirm before removing.\n", | |
"data = get_links(driver)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 83, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Build a DataFrame from the collected per-link dicts and write it to Locatel_EDOMEX.csv\n", | |
"# in the current working directory.\n", | |
"pd.DataFrame(output).to_csv(\"Locatel_EDOMEX.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 84, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"11112017-ricalanis-CNS-mujeresyniñasdesaparecidas.ipynb\r\n", | |
"AMBER_Guanajuato.csv\r\n", | |
"Alerta AMBER Guanajuato.ipynb\r\n", | |
"\u001b[1m\u001b[36mBases de Datos Finales\u001b[m\u001b[m\r\n", | |
"CodigoFinal-Copy1.ipynb\r\n", | |
"CodigoFinal.ipynb\r\n", | |
"Distribuciondeañosporbase.csv\r\n", | |
"Locatel.ipynb\r\n", | |
"Locatel_EDOMEX.csv\r\n", | |
"MayoresMenoresdeEdad.xlsx\r\n", | |
"Mujeres_Comun.csv\r\n", | |
"Mujeres_Comun.csv.bak\r\n", | |
"OGPMiniGrant.docx\r\n", | |
"PGJ.ipynb\r\n", | |
"PorcentajedeMayoresdeEdad.csv\r\n", | |
"PorcentajesPorAño.xlsx\r\n", | |
"RNPRED Explore.ipynb\r\n", | |
"Scrapper_RNPED.ipynb\r\n", | |
"Untitled.ipynb\r\n", | |
"datos_mayores.csv\r\n", | |
"datos_menores.csv\r\n", | |
"\u001b[1m\u001b[36mfierbois\u001b[m\u001b[m\r\n", | |
"google.png\r\n", | |
"\u001b[1m\u001b[36mimages\u001b[m\u001b[m\r\n", | |
"mujeres_desaparecidas_join.csv\r\n", | |
"output_tables.txt\r\n" | |
] | |
} | |
], | |
"source": [ | |
"# List the working directory (presumably to confirm Locatel_EDOMEX.csv was written).\n", | |
"!ls" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment