Created
April 3, 2018 15:35
-
-
Save ricalanis/b663c80889746cc29cbee57e28419ded to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from selenium import webdriver\n", | |
"from selenium.webdriver.common.keys import Keys\n", | |
"from bs4 import BeautifulSoup\n", | |
"import requests\n", | |
"import pandas\n", | |
"import simplejson" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def clean_string(input_string):\n", | |
"    \"\"\"Return the text that follows the first ':' in a scraped table cell.\n", | |
"\n", | |
"    The argument is converted with str(); non-breaking spaces, line breaks and\n", | |
"    the label, closing td and closing span tags are removed, and the result is\n", | |
"    stripped. Everything after the first colon is returned as is, so values\n", | |
"    that contain further colons (times, for example) are not truncated. An\n", | |
"    empty string is returned when there is no colon at all.\n", | |
"    \"\"\"\n", | |
"    text = str(input_string)\n", | |
"    for fragment in (\"\\xa0\", \"\\r\", \"\\n\", \"<label>\", \"</label>\", \"</td>\", \"</span>\"):\n", | |
"        text = text.replace(fragment, \"\")\n", | |
"    # partition() splits once and yields '' when ':' is missing (no IndexError).\n", | |
"    return text.strip().partition(\":\")[2]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def extract_page(url):\n", | |
"    \"\"\"Fetch url and return the field list produced by extract_data().\"\"\"\n", | |
"    request_pagina = requests.get(url)\n", | |
"    # Name the parser explicitly: without it bs4 emits a UserWarning and may\n", | |
"    # pick a different parser on another machine.\n", | |
"    soup_pagina = BeautifulSoup(request_pagina.text, \"lxml\")\n", | |
"    return extract_data(soup_pagina)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def extract_data(soup_pagina):\n", | |
"    \"\"\"Collect the cleaned values of cells 4 to 23 of the table at index 5.\n", | |
"\n", | |
"    soup_pagina is searched with find_all('table'); the td elements of the\n", | |
"    table at index 5 are listed and cells 4..23 are each passed through\n", | |
"    clean_string(). The 20 results are returned as a list. IndexError\n", | |
"    propagates when the page has fewer tables or cells.\n", | |
"    \"\"\"\n", | |
"    tables = soup_pagina.find_all('table')\n", | |
"    cells = tables[5].find_all('td')\n", | |
"    return [clean_string(cells[position]) for position in range(4, 24)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_links(soup):\n", | |
"    \"\"\"Return the distinct hrefs that contain 'Extra_FlowController_1id', in page order.\n", | |
"\n", | |
"    soup must offer find_all('a'); each anchor is read with .get('href').\n", | |
"    Anchors without an href are skipped.\n", | |
"    \"\"\"\n", | |
"    links_pagina = []\n", | |
"    for link in soup.find_all('a'):\n", | |
"        direccion_link = link.get('href')\n", | |
"        # A missing href comes back as None; skip it explicitly instead of\n", | |
"        # letting the 'in' test raise and printing a blank line per anchor.\n", | |
"        if not direccion_link:\n", | |
"            continue\n", | |
"        if \"Extra_FlowController_1id\" in direccion_link and direccion_link not in links_pagina:\n", | |
"            links_pagina.append(direccion_link)\n", | |
"    return links_pagina" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/lib/python3.6/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", | |
"\n", | |
"The code that caused this warning is on line 193 of the file /usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py. To get rid of this warning, change code that looks like this:\n", | |
"\n", | |
" BeautifulSoup([your markup])\n", | |
"\n", | |
"to this:\n", | |
"\n", | |
" BeautifulSoup([your markup], \"lxml\")\n", | |
"\n", | |
" markup_type=markup_type))\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"driver = webdriver.Chrome()\n", | |
"lista_registros = []\n", | |
"try:\n", | |
"    driver.get(\"http://www.cns.gob.mx/extraviadosWeb/portals/extraviados.portal\")\n", | |
"    age_element = driver.find_element_by_name(\"Extra_FlowController_1wlw-select_key:{actionForm.edad}\")\n", | |
"    # Typing into the age field picks an option; presumably \"Me\" selects the minors group -- confirm against the form.\n", | |
"    age_element.send_keys(\"Me\")\n", | |
"    search_element = driver.find_element_by_name(\"Submit\")\n", | |
"    search_element.click()\n", | |
"    soup = BeautifulSoup(driver.page_source, \"lxml\")\n", | |
"    links_pagina = get_links(soup)\n", | |
"    # Keep requesting the 'siguientes' action until a page yields no matching links.\n", | |
"    while links_pagina:\n", | |
"        for link in links_pagina:\n", | |
"            data = extract_page(link)\n", | |
"            lista_registros.append(data)\n", | |
"        driver.get(\"http://www.cns.gob.mx:80/extraviadosWeb/portals/extraviados.portal?_nfpb=true&_st=&_windowLabel=Extra_FlowController_1&Extra_FlowController_1_actionOverride=%2FConsulta%2FExtra_Flow%2Fsiguientes\")\n", | |
"        soup = BeautifulSoup(driver.page_source, \"lxml\")\n", | |
"        links_pagina = get_links(soup)\n", | |
"finally:\n", | |
"    # Close the browser even when a page fails, so no Chrome process is left behind.\n", | |
"    driver.quit()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['BAUTISTAALBERTO ARIANA',\n", | |
" '15/07/2001',\n", | |
" ' 15 años',\n", | |
" ' 154 cms.',\n", | |
" ' 50',\n", | |
" ' MEDIANA',\n", | |
" ' MORENA CLARA',\n", | |
" ' REDONDA GRANDE',\n", | |
" ' MEDIANOS RASGADOS CAFÉ OBSCURO',\n", | |
" ' ABUNDANTE CASTAÑO CLARO QUEBRADO LARGO',\n", | |
" ' GRANDE',\n", | |
" ' CHATA ANCHA MEDIANA',\n", | |
" ' REGULAR',\n", | |
" ' ESCASAS HACIA ABAJO CASTAÑO',\n", | |
" ' ROSAS MEDIANOS',\n", | |
" ' REDONDO MEDIANO',\n", | |
" '30/06/2017',\n", | |
" ' ECATEPEC ESTADO DE MEXICO',\n", | |
" 'EL 30 DE JUNIO DE 2017, LA ADOLESCENTE ARIANA BAUTISTA ALBERTO, SALIÓ DE SU DOMICILIO UBICADO EN ECATEPEC, ESTADO DE MÉXICO, SIN QUE HASTA EL MOMENTO SE TENGA CONOCIMIENTO DE SU PARADERO. DE LAS INVESTIGACIONES SE DESPRENDE QUE VIAJA EN COMPAÑÍA DE SINAÍ GARCÍA MARTÍNEZ. SE CONSIDERA QUE LA INTEGRIDAD DE LAS ADOLESCENTES SE ENCUENTRA EN RIESGO TODA VEZ QUE PUEDEN SER VÍCTIMAS DE LA COMISIÓN DE UN DELITO.',\n", | |
" '']" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"lista_registros[10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"columnas = ['Nombre',\n", | |
" 'Fecha de Nacimiento',\n", | |
" ' Edad',\n", | |
" 'Altura.',\n", | |
" 'Peso (kg)',\n", | |
" 'Complexión',\n", | |
" 'Tez',\n", | |
" 'Cara',\n", | |
" 'Ojos',\n", | |
" 'Cabello',\n", | |
" 'Beca',\n", | |
" 'Nariz',\n", | |
" 'Boca',\n", | |
" 'Cejas',\n", | |
" 'Labios',\n", | |
" 'Mentón',\n", | |
" 'Fecha de Desaparición',\n", | |
" 'Lugar de desaparición',\n", | |
" 'Hechos',\n", | |
" 'ETC']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pandas.DataFrame(lista_registros, columns = columnas).to_csv(\"datos_menores.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"2 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"3 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"4 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"5 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"6 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"7 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"8 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"9 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"10 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"11 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"12 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"13 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"14 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"15 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"16 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"17 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"18 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"19 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"20 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"21 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"22 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"23 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"24 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"25 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"26 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"27 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"28 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"29 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
" ... \n", | |
"994 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"995 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"996 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"997 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"998 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"999 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1000 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1001 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1002 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1003 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1004 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1005 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1006 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1007 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1008 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1009 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1010 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1011 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1012 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1013 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1014 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1015 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1016 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1017 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1018 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1019 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1020 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"1021 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1022 http://www.cns.gob.mx/extraviadosWeb/showImage...\n", | |
"1023 http://www.cns.gob.mx/extraviadosWeb/Consulta/...\n", | |
"Name: URL, Length: 1024, dtype: object" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pandas.read_csv(\"datos_mayores.csv\")[\"URL\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import shutil\n", | |
"\n", | |
"import requests\n", | |
"\n", | |
"\n", | |
"def save_image(name, url):\n", | |
"    \"\"\"Download url and write the body to images/<name>.gif.\n", | |
"\n", | |
"    The images directory is created when missing. Responses with an error\n", | |
"    status are reported and skipped so an error page is not stored as an image.\n", | |
"    \"\"\"\n", | |
"    os.makedirs(\"images\", exist_ok=True)\n", | |
"    # The with-block releases the streamed connection even if writing fails.\n", | |
"    with requests.get(url, stream=True) as response:\n", | |
"        if not response.ok:\n", | |
"            print(\"skipped \" + name + \": HTTP \" + str(response.status_code))\n", | |
"            return\n", | |
"        # Ask urllib3 to undo any Content-Encoding so the bytes on disk are the image itself.\n", | |
"        response.raw.decode_content = True\n", | |
"        with open(\"images/\"+name+\".gif\", 'wb') as out_file:\n", | |
"            shutil.copyfileobj(response.raw, out_file)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import slugify" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"datos_mayores = pandas.read_csv(\"datos_mayores.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": true, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Save a picture for every row whose URL does not contain \"silueta2\"\n", | |
"# (presumably the placeholder silhouette -- confirm against the site).\n", | |
"for i, datum in datos_mayores.iterrows():\n", | |
"    photo_url = datum[\"URL\"]\n", | |
"    if \"silueta2\" in photo_url:\n", | |
"        continue\n", | |
"    file_stem = slugify.slugify(datum[\"Nombre\"])\n", | |
"    save_image(file_stem, photo_url)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def clean_string(input_string):\n", | |
"    \"\"\"Return the text that follows the first ':' in a scraped table cell.\n", | |
"\n", | |
"    The argument is converted with str(); non-breaking spaces, line breaks and\n", | |
"    the label, closing td and closing span tags are removed, and the result is\n", | |
"    stripped. Everything after the first colon is returned as is, so values\n", | |
"    that contain further colons (times, for example) are not truncated. An\n", | |
"    empty string is returned when there is no colon at all.\n", | |
"    \"\"\"\n", | |
"    text = str(input_string)\n", | |
"    for fragment in (\"\\xa0\", \"\\r\", \"\\n\", \"<label>\", \"</label>\", \"</td>\", \"</span>\"):\n", | |
"        text = text.replace(fragment, \"\")\n", | |
"    # partition() splits once and yields '' when ':' is missing (no IndexError).\n", | |
"    return text.strip().partition(\":\")[2]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def extract_page(url):\n", | |
"    \"\"\"Fetch url and return the field list produced by extract_data().\"\"\"\n", | |
"    request_pagina = requests.get(url)\n", | |
"    # Name the parser explicitly: without it bs4 emits a UserWarning and may\n", | |
"    # pick a different parser on another machine.\n", | |
"    soup_pagina = BeautifulSoup(request_pagina.text, \"lxml\")\n", | |
"    return extract_data(soup_pagina)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def extract_data(soup_pagina):\n", | |
"    \"\"\"Return the 20 cleaned field values of a page followed by its photo URL.\n", | |
"\n", | |
"    Cells 4..23 of the table at index 5 are passed through clean_string().\n", | |
"    The src of the img element at index 37 is prefixed with the site host and\n", | |
"    appended as the last item. IndexError propagates when the page has fewer\n", | |
"    tables, cells or images.\n", | |
"    \"\"\"\n", | |
"    td_table = soup_pagina.find_all('table')[5].find_all('td')\n", | |
"    lista_respuestas = [clean_string(td_table[i]) for i in range(4, 24)]\n", | |
"    # Look the images up once and reuse the result instead of scanning the page twice.\n", | |
"    imgs = soup_pagina.find_all('img')\n", | |
"    girl_photo = \"http://www.cns.gob.mx\" + imgs[37][\"src\"]\n", | |
"    lista_respuestas.append(girl_photo)\n", | |
"    return lista_respuestas" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_links(soup):\n", | |
"    \"\"\"Return the distinct hrefs that contain 'Extra_FlowController_1id', in page order.\n", | |
"\n", | |
"    soup must offer find_all('a'); each anchor is read with .get('href').\n", | |
"    Anchors without an href are skipped.\n", | |
"    \"\"\"\n", | |
"    links_pagina = []\n", | |
"    for link in soup.find_all('a'):\n", | |
"        direccion_link = link.get('href')\n", | |
"        # A missing href comes back as None; skip it explicitly instead of\n", | |
"        # letting the 'in' test raise and printing a blank line per anchor.\n", | |
"        if not direccion_link:\n", | |
"            continue\n", | |
"        if \"Extra_FlowController_1id\" in direccion_link and direccion_link not in links_pagina:\n", | |
"            links_pagina.append(direccion_link)\n", | |
"    return links_pagina" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_lista_registros = pandas.DataFrame(lista_registros)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import csv\n", | |
"\n", | |
"# csv is imported here because no other cell imports it and csv.QUOTE_ALL is needed below.\n", | |
"# QUOTE_ALL quotes every field, so commas and line breaks in the free text stay inside one column.\n", | |
"df_lista_registros.to_csv(\"mujeres_ninas_mayores.csv\", quoting=csv.QUOTE_ALL)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "SyntaxError", | |
"evalue": "invalid syntax (<ipython-input-9-7be2c36cba81>, line 1)", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-9-7be2c36cba81>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m lista = [<td class=\"bea-portal-layout-placeholder-container\">\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" | |
] | |
} | |
], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment