Created
May 9, 2020 16:30
-
-
Save movsb/c9b9d908729592190ae206b2a4218037 to your computer and use it in GitHub Desktop.
A golang program to scrape [Douban Movie Top250](https://movie.douban.com/top250).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"io" | |
"net/http" | |
"os" | |
"strconv" | |
"strings" | |
"github.com/PuerkitoBio/goquery" | |
"github.com/WeiZhang555/tabwriter" | |
) | |
var config struct { | |
UserAgent string | |
} | |
func init() { | |
config.UserAgent = `Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:75.0) Gecko/20100101 Firefox/75.0` | |
} | |
// Item ... | |
type Item struct { | |
Titles []string | |
Score float64 | |
Link string | |
Type string | |
Quote string | |
} | |
// page starts from 1 | |
func scrape(page int) []*Item { | |
offset := (page - 1) * 25 | |
u := fmt.Sprintf(`https://movie.douban.com/top250?start=%d`, offset) | |
req, err := http.NewRequest(http.MethodGet, u, nil) | |
if err != nil { | |
panic(err) | |
} | |
req.Header.Set(`User-Agent`, config.UserAgent) | |
resp, err := http.DefaultClient.Do(req) | |
if err != nil { | |
panic(err) | |
} | |
defer resp.Body.Close() | |
if resp.StatusCode != 200 { | |
io.Copy(os.Stderr, resp.Body) | |
panic(resp.Status) | |
} | |
doc, err := goquery.NewDocumentFromReader(resp.Body) | |
if err != nil { | |
panic(err) | |
} | |
var items []*Item | |
doc.Find(`#content .article ol.grid_view > li`).Each(func(i int, s *goquery.Selection) { | |
var item Item | |
a := s.Find(`.hd > a`) | |
item.Link, _ = a.Attr(`href`) | |
for _, t := range strings.Split(a.Text(), `/`) { | |
item.Titles = append(item.Titles, strings.TrimSpace(t)) | |
} | |
item.Type = strings.TrimSpace(strings.Split(s.Find(`.bd > p`).Text(), "\n")[2]) | |
item.Score, _ = strconv.ParseFloat(s.Find(`.bd .rating_num`).Text(), 64) | |
item.Quote = s.Find(`.bd .quote span`).Text() | |
items = append(items, &item) | |
}) | |
return items | |
} | |
func main() { | |
var items []*Item | |
for i := 1; i <= 10; i++ { | |
items = append(items, scrape(i)...) | |
} | |
w := tabwriter.NewWriter(os.Stdout, 8, 8, 3, ' ', 0) | |
defer w.Flush() | |
for _, item := range items { | |
fmt.Fprintf(w, | |
"%s\t%.1f\t%s\t%s\t%s\n", | |
item.Titles[0], item.Score, item.Link, item.Type, item.Quote, | |
) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment