ricalanis · April 3, 2018 15:35
diff --git a/11112017-ricalanis-CNS-mujeresyniñasdesaparecidas.ipynb b/11112017-ricalanis-CNS-mujeresyniñasdesaparecidas.ipynb
diff --git a/Alerta AMBER Guanajuato.ipynb b/Alerta AMBER Guanajuato.ipynb
diff --git a/Locatel.ipynb b/Locatel.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.keys import Keys\n",
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "import pandas\n",
    "import simplejson"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Start a Chrome WebDriver and load the Locatel consultation page (consultaextra.asp).\n",
    "start_url = \"http://www.edomexico.gob.mx/locatel/htm/asp/consultaextra.asp\"\n",
    "driver = webdriver.Chrome()\n",
    "driver.get(start_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the element named \"Image1\" on the page that is currently loaded.\n",
    "# find_element_by_name() was removed in Selenium 4.3; find_element(by, value)\n",
    "# exists in both Selenium 3 and 4, and \"name\" is the value of By.NAME.\n",
    "button_start = driver.find_element(\"name\", \"Image1\")\n",
    "button_start.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _parse_query(query):\n",
    "    \"\"\"Turn a raw ``key=value&key=value`` string into a dict.\n",
    "\n",
    "    Pairs without an ``=`` are skipped; values are kept verbatim (no URL decoding).\n",
    "    \"\"\"\n",
    "    fields = {}\n",
    "    for pair in query.split(\"&\"):\n",
    "        # partition() splits on the first \"=\" only, so a value that itself\n",
    "        # contains \"=\" is kept whole instead of being cut short.\n",
    "        key, sep, value = pair.partition(\"=\")\n",
    "        if sep:\n",
    "            fields[key] = value\n",
    "    return fields\n",
    "\n",
    "\n",
    "def get_links(driver, parser=\"lxml\"):\n",
    "    \"\"\"Collect the query-string fields of every link in the page's third table.\n",
    "\n",
    "    Parses ``driver.page_source``, takes the anchors inside the third ``<table>``\n",
    "    and, for each ``href``, strips the ``verextraviado2.asp?`` prefix and splits\n",
    "    the remainder into a dict.\n",
    "\n",
    "    Args:\n",
    "        driver: object exposing the current page's HTML as ``page_source``\n",
    "            (a Selenium WebDriver in this notebook).\n",
    "        parser: BeautifulSoup parser name. Passing one explicitly avoids the\n",
    "            \"No parser was explicitly specified\" warning; \"lxml\" is the parser\n",
    "            that warning reported as being selected.\n",
    "\n",
    "    Returns:\n",
    "        A list with one dict per anchor that has an ``href``. The list is empty\n",
    "        when the page has fewer than three tables.\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(driver.page_source, parser)\n",
    "    tables = soup.find_all(\"table\")\n",
    "    if len(tables) < 3:\n",
    "        # Unexpected layout: report no links instead of failing on an unbound result.\n",
    "        return []\n",
    "    return [\n",
    "        _parse_query(anchor[\"href\"].replace(\"verextraviado2.asp?\", \"\"))\n",
    "        # href=True skips anchors without an href rather than aborting the whole page.\n",
    "        for anchor in tables[2].find_all(\"a\", href=True)\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
      "\n",
      "The code that caused this warning is on line 193 of the file /usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py. To get rid of this warning, change code that looks like this:\n",
      "\n",
      " BeautifulSoup([your markup])\n",
      "\n",
      "to this:\n",
      "\n",
      " BeautifulSoup([your markup], \"lxml\")\n",
      "\n",
      "  markup_type=markup_type))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "2\n",
      "3\n",
      "4\n",
      "5\n",
      "6\n",
      "7\n",
      "8\n",
      "9\n",
      "10\n",
      "11\n",
      "12\n",
      "13\n",
      "14\n",
      "15\n",
      "16\n",
      "17\n",
      "18\n",
      "19\n",
      "20\n",
      "21\n",
      "22\n",
      "23\n",
      "24\n",
      "25\n",
      "26\n",
      "27\n",
      "28\n",
      "29\n",
      "30\n",
      "31\n",
      "32\n",
      "33\n",
      "34\n",
      "35\n",
      "36\n",
      "37\n",
      "38\n",
      "39\n",
      "40\n",
      "41\n",
      "42\n",
      "43\n",
      "44\n",
      "45\n",
      "46\n",
      "47\n",
      "48\n",
      "49\n",
      "50\n",
      "51\n",
      "52\n",
      "53\n",
      "54\n",
      "55\n",
      "56\n",
      "57\n",
      "58\n",
      "59\n",
      "60\n",
      "61\n",
      "62\n",
      "63\n",
      "64\n",
      "65\n",
      "66\n",
      "67\n",
      "68\n",
      "69\n",
      "70\n"
     ]
    }
   ],
   "source": [
    "# Listing pages are addressed as ...vernomextra.asp?pagina=<n>; the recorded run fetched pages 1-70.\n",
    "# TODO(review): confirm 70 is still the last page before re-running.\n",
    "LISTING_URL = \"http://www.edomexico.gob.mx/locatel/htm/asp/vernomextra.asp?pagina=\"\n",
    "LAST_PAGE = 70\n",
    "\n",
    "output = []\n",
    "for page in range(1, LAST_PAGE + 1):\n",
    "    driver.get(LISTING_URL + str(page))\n",
    "    # extend() grows the list in place; `output = output + ...` copied it on every page.\n",
    "    output.extend(get_links(driver))\n",
    "    # Progress indicator: one line per page fetched.\n",
    "    print(page)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
      "\n",
      "The code that caused this warning is on line 193 of the file /usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py. To get rid of this warning, change code that looks like this:\n",
      "\n",
      " BeautifulSoup([your markup])\n",
      "\n",
      "to this:\n",
      "\n",
      " BeautifulSoup([your markup], \"lxml\")\n",
      "\n",
      "  markup_type=markup_type))\n"
     ]
    }
   ],
   "source": [
    "# Parse only the page the browser is currently showing.\n",
    "# NOTE(review): `data` is not read by any other cell shown here; this looks like a leftover check.\n",
    "data = get_links(driver)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one table from the collected link dicts and write it as CSV.\n",
    "links_table = pd.DataFrame(output)\n",
    "links_table.to_csv(\"Locatel_EDOMEX.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11112017-ricalanis-CNS-mujeresyniñasdesaparecidas.ipynb\r\n",
      "AMBER_Guanajuato.csv\r\n",
      "Alerta AMBER Guanajuato.ipynb\r\n",
      "\u001b[1m\u001b[36mBases de Datos Finales\u001b[m\u001b[m\r\n",
      "CodigoFinal-Copy1.ipynb\r\n",
      "CodigoFinal.ipynb\r\n",
      "Distribuciondeañosporbase.csv\r\n",
      "Locatel.ipynb\r\n",
      "Locatel_EDOMEX.csv\r\n",
      "MayoresMenoresdeEdad.xlsx\r\n",
      "Mujeres_Comun.csv\r\n",
      "Mujeres_Comun.csv.bak\r\n",
      "OGPMiniGrant.docx\r\n",
      "PGJ.ipynb\r\n",
      "PorcentajedeMayoresdeEdad.csv\r\n",
      "PorcentajesPorAño.xlsx\r\n",
      "RNPRED Explore.ipynb\r\n",
      "Scrapper_RNPED.ipynb\r\n",
      "Untitled.ipynb\r\n",
      "datos_mayores.csv\r\n",
      "datos_menores.csv\r\n",
      "\u001b[1m\u001b[36mfierbois\u001b[m\u001b[m\r\n",
      "google.png\r\n",
      "\u001b[1m\u001b[36mimages\u001b[m\u001b[m\r\n",
      "mujeres_desaparecidas_join.csv\r\n",
      "output_tables.txt\r\n"
     ]
    }
   ],
   "source": [
    "!ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
diff --git a/Scrapper_RNPED.ipynb b/Scrapper_RNPED.ipynb
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 70,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from selenium import webdriver\n",
	"from selenium.webdriver.common.keys import Keys\n",
	"from bs4 import BeautifulSoup\n",
	"import requests\n",
	"import pandas\n",
	"import simplejson"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 71,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Start a Chrome WebDriver and load the Locatel consultation page (consultaextra.asp).\n",
	"start_url = \"http://www.edomexico.gob.mx/locatel/htm/asp/consultaextra.asp\"\n",
	"driver = webdriver.Chrome()\n",
	"driver.get(start_url)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 72,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Click the element named \"Image1\" on the page that is currently loaded.\n",
	"# find_element_by_name() was removed in Selenium 4.3; find_element(by, value)\n",
	"# exists in both Selenium 3 and 4, and \"name\" is the value of By.NAME.\n",
	"button_start = driver.find_element(\"name\", \"Image1\")\n",
	"button_start.click()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 73,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 74,
	"metadata": {},
	"outputs": [],
	"source": [
	"def _parse_query(query):\n",
	"    \"\"\"Turn a raw ``key=value&key=value`` string into a dict.\n",
	"\n",
	"    Pairs without an ``=`` are skipped; values are kept verbatim (no URL decoding).\n",
	"    \"\"\"\n",
	"    fields = {}\n",
	"    for pair in query.split(\"&\"):\n",
	"        # partition() splits on the first \"=\" only, so a value that itself\n",
	"        # contains \"=\" is kept whole instead of being cut short.\n",
	"        key, sep, value = pair.partition(\"=\")\n",
	"        if sep:\n",
	"            fields[key] = value\n",
	"    return fields\n",
	"\n",
	"\n",
	"def get_links(driver, parser=\"lxml\"):\n",
	"    \"\"\"Collect the query-string fields of every link in the page's third table.\n",
	"\n",
	"    Parses ``driver.page_source``, takes the anchors inside the third ``<table>``\n",
	"    and, for each ``href``, strips the ``verextraviado2.asp?`` prefix and splits\n",
	"    the remainder into a dict.\n",
	"\n",
	"    Args:\n",
	"        driver: object exposing the current page's HTML as ``page_source``\n",
	"            (a Selenium WebDriver in this notebook).\n",
	"        parser: BeautifulSoup parser name. Passing one explicitly avoids the\n",
	"            \"No parser was explicitly specified\" warning; \"lxml\" is the parser\n",
	"            that warning reported as being selected.\n",
	"\n",
	"    Returns:\n",
	"        A list with one dict per anchor that has an ``href``. The list is empty\n",
	"        when the page has fewer than three tables.\n",
	"    \"\"\"\n",
	"    soup = BeautifulSoup(driver.page_source, parser)\n",
	"    tables = soup.find_all(\"table\")\n",
	"    if len(tables) < 3:\n",
	"        # Unexpected layout: report no links instead of failing on an unbound result.\n",
	"        return []\n",
	"    return [\n",
	"        _parse_query(anchor[\"href\"].replace(\"verextraviado2.asp?\", \"\"))\n",
	"        # href=True skips anchors without an href rather than aborting the whole page.\n",
	"        for anchor in tables[2].find_all(\"a\", href=True)\n",
	"    ]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 75,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/usr/local/lib/python3.6/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
	"\n",
	"The code that caused this warning is on line 193 of the file /usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py. To get rid of this warning, change code that looks like this:\n",
	"\n",
	" BeautifulSoup([your markup])\n",
	"\n",
	"to this:\n",
	"\n",
	" BeautifulSoup([your markup], \"lxml\")\n",
	"\n",
	" markup_type=markup_type))\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1\n",
	"2\n",
	"3\n",
	"4\n",
	"5\n",
	"6\n",
	"7\n",
	"8\n",
	"9\n",
	"10\n",
	"11\n",
	"12\n",
	"13\n",
	"14\n",
	"15\n",
	"16\n",
	"17\n",
	"18\n",
	"19\n",
	"20\n",
	"21\n",
	"22\n",
	"23\n",
	"24\n",
	"25\n",
	"26\n",
	"27\n",
	"28\n",
	"29\n",
	"30\n",
	"31\n",
	"32\n",
	"33\n",
	"34\n",
	"35\n",
	"36\n",
	"37\n",
	"38\n",
	"39\n",
	"40\n",
	"41\n",
	"42\n",
	"43\n",
	"44\n",
	"45\n",
	"46\n",
	"47\n",
	"48\n",
	"49\n",
	"50\n",
	"51\n",
	"52\n",
	"53\n",
	"54\n",
	"55\n",
	"56\n",
	"57\n",
	"58\n",
	"59\n",
	"60\n",
	"61\n",
	"62\n",
	"63\n",
	"64\n",
	"65\n",
	"66\n",
	"67\n",
	"68\n",
	"69\n",
	"70\n"
	]
	}
	],
	"source": [
	"# Listing pages are addressed as ...vernomextra.asp?pagina=<n>; the recorded run fetched pages 1-70.\n",
	"# TODO(review): confirm 70 is still the last page before re-running.\n",
	"LISTING_URL = \"http://www.edomexico.gob.mx/locatel/htm/asp/vernomextra.asp?pagina=\"\n",
	"LAST_PAGE = 70\n",
	"\n",
	"output = []\n",
	"for page in range(1, LAST_PAGE + 1):\n",
	"    driver.get(LISTING_URL + str(page))\n",
	"    # extend() grows the list in place; `output = output + ...` copied it on every page.\n",
	"    output.extend(get_links(driver))\n",
	"    # Progress indicator: one line per page fetched.\n",
	"    print(page)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 57,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/usr/local/lib/python3.6/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
	"\n",
	"The code that caused this warning is on line 193 of the file /usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py. To get rid of this warning, change code that looks like this:\n",
	"\n",
	" BeautifulSoup([your markup])\n",
	"\n",
	"to this:\n",
	"\n",
	" BeautifulSoup([your markup], \"lxml\")\n",
	"\n",
	" markup_type=markup_type))\n"
	]
	}
	],
	"source": [
	"# Parse only the page the browser is currently showing.\n",
	"# NOTE(review): `data` is not read by any other cell shown here; this looks like a leftover check.\n",
	"data = get_links(driver)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 83,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Build one table from the collected link dicts and write it as CSV.\n",
	"links_table = pd.DataFrame(output)\n",
	"links_table.to_csv(\"Locatel_EDOMEX.csv\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 84,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"11112017-ricalanis-CNS-mujeresyniñasdesaparecidas.ipynb\r\n",
	"AMBER_Guanajuato.csv\r\n",
	"Alerta AMBER Guanajuato.ipynb\r\n",
	"\u001b[1m\u001b[36mBases de Datos Finales\u001b[m\u001b[m\r\n",
	"CodigoFinal-Copy1.ipynb\r\n",
	"CodigoFinal.ipynb\r\n",
	"Distribuciondeañosporbase.csv\r\n",
	"Locatel.ipynb\r\n",
	"Locatel_EDOMEX.csv\r\n",
	"MayoresMenoresdeEdad.xlsx\r\n",
	"Mujeres_Comun.csv\r\n",
	"Mujeres_Comun.csv.bak\r\n",
	"OGPMiniGrant.docx\r\n",
	"PGJ.ipynb\r\n",
	"PorcentajedeMayoresdeEdad.csv\r\n",
	"PorcentajesPorAño.xlsx\r\n",
	"RNPRED Explore.ipynb\r\n",
	"Scrapper_RNPED.ipynb\r\n",
	"Untitled.ipynb\r\n",
	"datos_mayores.csv\r\n",
	"datos_menores.csv\r\n",
	"\u001b[1m\u001b[36mfierbois\u001b[m\u001b[m\r\n",
	"google.png\r\n",
	"\u001b[1m\u001b[36mimages\u001b[m\u001b[m\r\n",
	"mujeres_desaparecidas_join.csv\r\n",
	"output_tables.txt\r\n"
	]
	}
	],
	"source": [
	"!ls"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}