Example of web scraping with Python and PyMongo
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# codec aliases: utf_8, U8, UTF, utf8
"""read_from_json.py:
Example of web scraping
with Python and PyMongo.
"""
__author__ = "GlukAlex"
import pymongo
import sys
import json
import requests
#import urllib.request
# AttributeError: 'module' object has no attribute 'request'
#import urllib
# ImportError: No module named 'urllib2'
#import urllib2
"""Note | |
The 'urllib2' module | |
has been split across several modules | |
in Python 3 named | |
'urllib.request' and | |
'urllib.error'. | |
The '2to3 tool' will | |
automatically adapt `imports` | |
when converting your sources to Python 3. | |
""" | |
"""Note | |
The 'urllib' module | |
has been split into parts and | |
renamed in Python 3 to | |
'urllib.request', | |
'urllib.parse', and | |
'urllib.error'. | |
The '2to3 tool' will | |
automatically adapt imports | |
when converting your sources to Python 3. | |
Also note that | |
the 'urllib.request.urlopen()' function | |
in Python 3 is equivalent to | |
'urllib2.urlopen()' and that | |
'urllib.urlopen()' has been removed. | |
""" | |
def get_N_Insert_Page_Content(url: str = ""):
    # connect to the database
    client = pymongo.MongoClient(
        'mongodb://localhost'
        #'localhost',
        # default port from MongoDB config files
        # for the server to listen on
        #27017
    )
    # attach to the 'reddit' database
    db = client.reddit
    # handle to the 'stories' collection
    stories = db.stories
    # clear / entirely delete the existing collection
    stories.drop()
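    # An alternative sketch: 'drop()' discards the collection along with
    # its indexes; to keep both and only remove the documents, PyMongo's
    # 'delete_many' with an empty filter would do:
    #stories.delete_many({})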
    # JSON Response Content
    # There’s also
    # a builtin JSON decoder,
    # in case you’re dealing with JSON data:
    #>>> import requests
    #>>> r = requests.get('https://api.github.com/events')
    #>>> r.json()
    # In case
    # the JSON `decoding` fails,
    # 'r.json()' raises an exception.
    # For example,
    # if the `response` gets a '204' (No Content), or
    # if the `response` contains `invalid` JSON,
    # attempting 'r.json()' raises
    # ValueError: No JSON object could be decoded.
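    # A hedged sketch of guarding that decode step, matching the note
    # above (in current 'requests' the error is
    # 'requests.exceptions.JSONDecodeError', a 'ValueError' subclass,
    # so catching 'ValueError' covers old and new versions alike):
    #try:
    #    data = r.json()
    #except ValueError:
    #    data = None  # body was empty or not valid JSON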
    # get the specified web page,
    # falling back to a local sample copy if no 'url' was passed in
    reddit_page = (
        url or
        #"https://www.reddit.com/r/technology/.json"
        "http://localhost:8888/files/PyMongo/reddit_com_technology.json"
    )
    #urllib.request.urlopen(
    #    url,
    #    data=None,
    #    [timeout, ]*,
    #    cafile=None,
    #    capath=None,
    #    cadefault=False,
    #    context=None)
    # Open the URL url,
    # which can be
    # either a string or
    # a Request object.
    # For 'http' and 'https' `urls`,
    # 'urlopen' returns
    # an 'http.client.HTTPResponse' object
    # which has
    # the 'HTTPResponse Objects' methods.
    page_Content = (
        requests.get(reddit_page)
        #urllib.request.urlopen(reddit_page)
        #urllib2.urlopen(reddit_page)
    )
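    # A hedged addition: 'requests' responses expose 'raise_for_status()',
    # which raises 'requests.HTTPError' for 4xx/5xx replies; enabling it
    # here would fail fast on a bad fetch:
    #page_Content.raise_for_status()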
    # AttributeError: 'bytes' object has no attribute 'read'
    # DEBUG
    print("""type(page_Content) is: {}""".format(type(page_Content)))
    page_JSON = page_Content.json()
    # DEBUG
    print("""type(page_JSON) is: {}""".format(type(page_JSON)))
    if isinstance(page_JSON, (str, dict)):
        #print("""page_JSON is: {0['data']:50}""".format(page_JSON))
        # truncate the preview to 50 characters
        print("""page_JSON is: {:.50}""".format(str(page_JSON)))
    #json.load(
    #    fp,
    #    cls=None,
    #    object_hook=None,
    #    parse_float=None,
    #    parse_int=None,
    #    parse_constant=None,
    #    object_pairs_hook=None,
    #    **kw)
    # Deserialize 'fp'
    # (a '.read()'-supporting file-like object
    # containing a JSON document) to
    # a Python object
    # using this conversion table.
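    # A tiny hedged illustration of that deserialization on an inline
    # string ('json.loads' is the string-input twin of 'json.load'):
    #sample = json.loads('{"data": {"children": []}}')
    #assert sample["data"]["children"] == []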
    #page_Content_Parsed = (
    #    json.load(
    # An 'HTTPResponse' instance
    # wraps the `HTTP response` from the `server`.
    # It provides
    # access to
    # the `request headers` and
    # the `entity body`.
    # The response is
    # an `iterable` object and
    # can be used in a 'with' statement.
    #HTTPResponse.read([amt])
    # Reads and returns
    # the `response body`, or
    # up to the next 'amt' bytes.
    #        page_Content.read()
    #        page_Content.json()
    #    )
    #)
    # Receiving
    # a status '429' is
    # not an error;
    # it is the other server "kindly" asking you to
    # please stop spamming requests.
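    # A hedged sketch of honoring that: back off and retry once, using
    # the 'Retry-After' header when the server provides it ('time' would
    # need to be imported at the top of the script):
    #if page_Content.status_code == 429:
    #    time.sleep(int(page_Content.headers.get("Retry-After", 5)))
    #    page_Content = requests.get(reddit_page)
    #    page_JSON = page_Content.json()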
    #json_dict.get('data').get('children')[0].get('data')
    if page_JSON != {'error': 429}:
        content_Topics = (
            #page_JSON["data"]["children"]
            page_JSON.get('data').get('children')
        )
        # populate 'stories' with page data
        print("""populating 'stories' with page data ...""")
        # iterate over the array of objects
        #for item in page_Content_Parsed["data"]["children"]:
        for item in content_Topics:
            # side effect:
            #insert_one(document)
            # inserts a single document
            if item:
                stories.insert_one(item["data"])
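        # An alternative sketch: PyMongo's 'insert_many' stores all the
        # documents in one round trip (it raises on an empty list, so
        # only call it when there is something to insert):
        #docs = [item["data"] for item in content_Topics if item]
        #if docs:
        #    stories.insert_many(docs)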
# unit test
if __name__ == "__main__":
    # OK ?
    get_N_Insert_Page_Content()
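    # A quick hedged sanity check of what landed in MongoDB
    # ('count_documents' and 'find_one' are standard PyMongo collection
    # methods); commented out to keep the run side-effect-free:
    #check = pymongo.MongoClient('mongodb://localhost').reddit.stories
    #print(check.count_documents({}))
    #print(check.find_one())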