Verification script for Google image search (Python)
#!/usr/bin/env python3
# -*- encoding: UTF-8 -*-
import chromedriver_autoinstaller
import time
import re
import demjson
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


def click_img(we):
    # Unused helper stub.
    pass


def main():
    # Time to wait after actions such as clicking (seconds)
    sleep_between_interactions = 2

    # Class name of the "Show more results" button
    more_button_class_name = 'mye4qd'

    # Number of images to download
    # download_num = 100

    # Search keyword
    query = "水瀬いのり"

    # URL template for image search
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # use chrome driver
    chromedriver_autoinstaller.install()

    # Get the thumbnail image URLs
    driver = webdriver.Chrome()
    driver.get(search_url.format(q=query))

    # Scroll down and click "Show more results"
    for i in range(10):
        # Scroll down
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

        # Wait
        time.sleep(sleep_between_interactions)

        # Click the "Show more results" button
        try:
            driver.find_element(By.CLASS_NAME, more_button_class_name).click()

            # Wait
            time.sleep(sleep_between_interactions)
        except Exception:
            pass

    # Get the thumbnail image links (if this step fails, inspect the page and adjust the selector)
    # thumbnail_results = driver.find_elements_by_css_selector("img.rg_i")
    # print(len(thumbnail_results))
    page_source = driver.page_source

    # Get the crumb parameter value from the preprocessing request (regex)
    # pattern = r'\["http[^"]+",'
    # data = re.findall(pattern, page_source)
    soup = BeautifulSoup(page_source, 'lxml')
    image_divs = soup.find_all('script')

    result = list()
    for div in image_divs:
        # TODO: reintegrate the constraints?
        # meta = json.loads(div.text)
        # if 'ou' in meta and 'ity' in meta and meta['ity'] is not "" and "lookaside.fbsbx.com" not in meta['ou']:
        #     yield dict(file_url=meta['ou'])
        txt = div.string
        if txt is None or not txt.startswith('AF_initDataCallback'):
            continue
        if 'ds:1' not in txt:
            continue

        # Strip the AF_initDataCallback(...) wrapper, keeping only the object literal
        txt = re.sub(r"^AF_initDataCallback\(({.*key: 'ds:\d',.+, data:.+})\);$",
                     "\\1", txt, 0, re.DOTALL)

        meta = demjson.decode(txt)['data']
        # Index path into the ds:1 payload; this depends on Google's current response layout
        data = meta[31][0][12][2]

        # uris = [img[1][3][0] for img in data if img[0] == 1]
        for img in data:
            if img[0] == 1:
                uri = img[1][3][0]
                puri = img[1][9]['2003'][2]
                title = img[1][9]['2003'][3]
                result.append({'link': uri, 'title': title, 'pagelink': puri})

    print(result)
    driver.quit()


if __name__ == '__main__':
    main()
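Note on the parsing step: demjson is used because the argument to AF_initDataCallback is JavaScript rather than strict JSON (unquoted keys, single-quoted strings), but the package is unmaintained and can be awkward to install on recent Python versions. Below is a minimal alternative sketch, not part of the original script, that extracts only the data array and parses it with the standard-library json module. It assumes the blob keeps a data: [...], sideChannel: {...} layout and that the array itself is strict JSON, which may not hold for every response.

import json
import re


def extract_data_array(script_text):
    # Hypothetical helper (not in the original gist): pull out only the array
    # passed as the `data` argument of AF_initDataCallback and parse it with
    # the standard library instead of demjson.
    match = re.search(r"data:(\[.*\]),\s*sideChannel", script_text, re.DOTALL)
    if match is None:
        return None
    return json.loads(match.group(1))

Under that assumption, meta = extract_data_array(txt) could stand in for demjson.decode(txt)['data'] in main().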
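The script only prints the collected metadata, while the commented-out download_num suggests that downloading the files was also intended. Here is a minimal sketch of that step, assuming the link URLs in result are directly fetchable; the save_images helper, the out_dir layout, and the fixed .jpg extension are illustrative choices, not part of the gist.

import os
import requests


def save_images(result, out_dir="images", limit=100):
    # Hypothetical helper: fetch each collected image URL and write it to disk.
    os.makedirs(out_dir, exist_ok=True)
    for i, item in enumerate(result[:limit]):
        try:
            resp = requests.get(item['link'], timeout=10)
            resp.raise_for_status()
        except requests.RequestException:
            continue  # skip URLs that fail or time out
        with open(os.path.join(out_dir, f"{i:04d}.jpg"), "wb") as f:
            f.write(resp.content)

Calling save_images(result) at the end of main() would mirror the intent of download_num; in practice the response content type should be checked instead of assuming .jpg.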