Skip to content

Instantly share code, notes, and snippets.

@jrainlau
Created January 22, 2019 09:42
Show Gist options
  • Save jrainlau/55a46bd0d3318e948331dc81e3d22d32 to your computer and use it in GitHub Desktop.
Save jrainlau/55a46bd0d3318e948331dc81e3d22d32 to your computer and use it in GitHub Desktop.
wallpapers爬虫
const puppeteer = require('puppeteer')
const fs = require('fs')
// Category slugs to crawl. Each entry is interpolated into the listing URL
// `http://wallpaperswide.com/${cateName}-desktop-wallpapers/page/${n}` by autoRun.
// The list is flat: several slugs occur more than once (e.g. 'others', 'cars',
// 'movies', 'music', 'avatar', 'batman'), so those listing URLs are requested
// once per occurrence.
// NOTE(review): "childrenren's_day" looks like a typo — confirm against the slug
// the site actually uses before changing it.
const cateNames = ['aero', 'auroras', 'black', 'bokeh', 'colorful', 'creative', 'fresh', 'macro', 'patterns', 'rainbow', 'vector_art', 'white', 'animals', 'birds', 'horses', 'insects', 'others', 'pets', 'reptiles_&_frogs', 'sea', 'wild', 'architecture', 'army', 'artistic', '3d', 'abstract', 'anime', 'drawings', 'fantasy', 'graffiti', 'grunge', 'sculpture', 'typography', 'urban', 'black_and_white', 'cartoons', 'bee_movie', 'bolt', 'brave', 'cars', 'coraline', 'futurama', 'gnomeo_&_juliet', 'ice_age', 'incredibles', 'kung_fu_panda', 'madagascar', 'monsters_inc', 'ninja_turtles', 'old_disney', 'open_season', 'others', 'planet_51', 'ratatouille', 'shrek', 'south_park', 'tangled', 'the_princess_and_the_frog', 'the_simpsons', 'tinker_bell', 'toy_story', 'up', 'walle', 'celebrities', 'models', 'movies', 'music', 'charity', 'city', 'computers', 'android', 'firefox', 'hardware', 'linux', 'mac', 'nvidia', 'others', 'vaio', 'web', 'windows', 'cute', 'elements', 'earth', 'fire', 'water', 'food_and_drink', 'funny', 'games', 'age_of_empires', 'angry_birds', 'anno_1404', "assassin's_creed", 'avatar', 'batman', 'battlefield', 'bioshock', 'brink', 'call_of_duty', 'chess', 'command_and_conquer', 'crysis', "dante's_inferno", 'darksiders', 'dead_space', 'destiny', 'deus_ex', 'devil_may_cry', 'diablo', 'dota', 'dragon_age', 'driver', 'empire_total_war', 'fable', 'fallout', 'far_cry', 'fear', 'final_fantasy', 'forza_motorsport', 'gears_of_war', 'ghost_recon', 'god_of_war', 'gran_turismo', 'grand_theft_auto', 'guild_wars', 'half_life', 'halo', 'heavenly_sword', 'heroes', 'hitman', 'infamous', 'killzone', 'l.a._noire', 'left_4_dead', 'machinarium', 'mario', 'mass_effect', 'medal_of_honor', 'metal_gear', 'midnight_club', 'minecraft', "mirror's_edge", 'mortal_kombat', 'need_for_speed', 'other_games', 'overlord', 'poker', 'portal', 'prince_of_persia', 'prototype', 'quake_wars', 'rayman', 'rayman_raving_rabbids', 'red_dead_redemption', 'resident_evil', 'rift', 'rockstar_games', 
's.t.a.l.k.e.r.', 'splinter_cell', 'star_wars', 'starcraft', 'street_fighter', 'the_elder_scrolls', 'the_witcher', 'thief', 'tom_clancy', 'tomb_raider', 'trine', 'two_worlds', 'uncharted', 'valkyria_chronicles', 'warhammer', 'watch_dogs', 'world_of_warcraft', 'girls', 'holidays', 'birthday', "childrenren's_day", 'christmas', 'easter', "father's_day", 'halloween', 'independence_day', "mother's_day", 'new_year', "saint_patrick's_day", "valentine's_day", 'love', 'motors', 'airplane', 'atv', 'cars', 'classic_cars', 'motorcycles', 'others', 'trains', 'movies', '28_weeks_later', '300', 'alice_in_wonderland', 'angels_and_demons', 'avatar', 'batman', 'captain_america', 'clash_of_the_titans', 'game_of_thrones', 'harry_potter', 'high_school_musical', 'hitman', 'iron_man', 'king_kong', 'lost', 'man_of_steel', 'other_movies', 'oz_the_great_and_powerful', 'pirates_of_the_caribbean', 'prince_of_persia', 'real_steel', 'robin_hood', 'sex_and_the_city', 'snow_white_&_the_huntsman', 'spider-man', 'star_trek', 'star_wars', 'sucker_punch', 'the_avengers', 'the_hobbit', 'the_incredible_hulk', 'thor', 'transformers', 'tron_legacy', 'twilight', 'watchmen', 'x-men', 'music', 'nature', 'beach', 'desert', 'flowers', 'forests', 'lakes', 'landscape', 'mountains', 'rivers', 'sun_&_sky', 'waterfalls', 'seasons', 'autumn', 'calendar', 'spring', 'summer', 'winter', 'space', '2016_summer_olympics', 'baseball', 'basketball', 'biking', 'boxing', 'fitness', 'football', 'formula_1', 'free_running', 'golf', 'motorcycle_racing', 'other_sports', 'parkour', 'skateboarding', 'skiing', 'surfing', 'tennis', 'winter_olympic_games', 'wrestling', 'travel', 'africa', 'america', 'antarctica', 'asia', 'europe', 'islands', 'maps', 'oceania', 'other', 'vintage']
/**
 * Scrapes a single wallpaper listing page.
 *
 * Loads `url` in `page`, waits for a `.wallpapers` container, then collects one
 * record per `<img>` found inside the LAST `.wallpapers` container on the page.
 *
 * @param {string} url - Listing page to load.
 * @param {object} page - Puppeteer page; `goto`, `waitForSelector` and `$$eval` are used.
 * @returns {Promise<Array<{thumb: string, name: string, downloadUrl: string, total: string}>>}
 *   One entry per thumbnail. `total` is the page-count text read from the
 *   pagination bar, or '1' when the page shows no usable pagination bar.
 * @throws {Error} If the resolution element in the page header cannot be found.
 */
const crawler = async (url, page) => {
  console.log(`Dealing with "${url}"...`)
  await page.goto(url)
  await page.waitForSelector('.wallpapers')

  // The callback below is executed inside the browser page, so it must stay
  // self-contained: it cannot reference anything from the Node-side scope.
  return page.$$eval('.wallpapers', wallpaperNodeList => {
    // Page count: text of the second-to-last child of the last `.pagination`
    // element (the last child is presumably the "next" control — unverified).
    // Listings without a pagination bar are treated as a single page instead
    // of crashing on an undefined element.
    const paginationBlocks = document.querySelectorAll('.pagination')
    const lastPagination = paginationBlocks[paginationBlocks.length - 1]
    const pageLinks = lastPagination ? lastPagination.children : []
    const lastPageLink = pageLinks[pageLinks.length - 2]
    const pageAmount = lastPageLink ? lastPageLink.textContent.trim() : '1'

    // Resolution text from the header's `.screen-res` block, whitespace removed
    // so it can be embedded in the download file name.
    const resolutionNode = document.querySelector('#header > div.screen-res > span:nth-child(4) > strong')
    if (!resolutionNode) {
      throw new Error('Screen resolution element not found in page header')
    }
    const screenResolution = resolutionNode.textContent.replace(/\s+/g, '')

    const onShowWallpapers = wallpaperNodeList[wallpaperNodeList.length - 1]
    return Array.from(onShowWallpapers.querySelectorAll('img')).map(img => {
      // Strip the thumbnail host prefix (http or https) and the "-t1.jpg"
      // suffix, leaving only the wallpaper's base name.
      const imgName = img.src.replace(/https?:\/\/hd\.wallpaperswide\.com\/thumbs\/|-t1\.jpg/g, '')
      return {
        thumb: img.src,
        name: imgName,
        downloadUrl: `http://wallpaperswide.com/download/${imgName}-wallpaper-${screenResolution}.jpg`,
        total: pageAmount
      }
    })
  })
}
/**
 * Crawls one category's listing pages, starting at `pageIndex` and continuing
 * through the last page reported by the site, persisting results to
 * `./cate.json` after every page.
 *
 * Each page is fetched exactly once. (The previous recursive form re-entered
 * itself from inside its own loop, so later pages were crawled repeatedly.)
 *
 * @param {string} cateName - Category slug inserted into the listing URL.
 * @param {number} pageIndex - First page number to crawl.
 * @param {object} page - Puppeteer page handed through to `crawler`.
 * @returns {Promise<object>} The full contents of `./cate.json` after the last
 *   write: a map from listing URL to the array returned by `crawler`.
 * @throws Whatever `crawler` or the file system calls throw, except a missing
 *   `./cate.json`, which is treated as an empty store.
 */
const autoRun = async (cateName, pageIndex, page) => {
  const storePath = './cate.json'

  // Load what has been persisted so far; a missing file simply means nothing
  // has been crawled yet. Any other read/parse failure is propagated.
  let categories = {}
  try {
    categories = JSON.parse(fs.readFileSync(storePath, 'utf8'))
  } catch (err) {
    if (err.code !== 'ENOENT') throw err
  }

  // Until a page tells us otherwise, assume the starting page is the last one.
  let pageTotal = pageIndex
  for (let index = pageIndex; index <= pageTotal; index++) {
    const url = `http://wallpaperswide.com/${cateName}-desktop-wallpapers/page/${index}`
    const wallpapers = await crawler(url, page)

    // Write after every page so progress survives a crash mid-category.
    categories[url] = wallpapers
    fs.writeFileSync(storePath, JSON.stringify(categories, null, 2))

    // Every record on a page carries the same `total`; an empty page carries none.
    const reportedTotal = wallpapers.length > 0 ? Number(wallpapers[0].total) : NaN
    if (Number.isFinite(reportedTotal) && reportedTotal > pageTotal) {
      pageTotal = reportedTotal
    }
  }
  return categories
}
/**
 * Launches a browser, crawls every category in `cateNames` in order (one at a
 * time, each starting from page 1), and closes the browser afterwards.
 *
 * The browser is closed even when a crawl fails, so a failed run does not
 * leave a browser process behind. The category list is iterated, not consumed.
 *
 * @returns {Promise<void>} Resolves once every category has been crawled.
 * @throws Re-throws the first error raised while opening the page or crawling.
 */
const run = async () => {
  const browser = await puppeteer.launch({
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  })
  try {
    const page = await browser.newPage()
    for (const cateName of cateNames) {
      await autoRun(cateName, 1, page)
    }
  } finally {
    await browser.close()
  }
}
// Entry point: start the crawl. The returned promise is handled explicitly so
// a failure is logged and reflected in the process exit code instead of being
// left as an unhandled rejection.
run().catch(err => {
  console.error(err)
  process.exitCode = 1
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment