-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.go
99 lines (83 loc) · 2.2 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package main
import (
"astuart.co/goq"
"fmt"
"github.com/gocolly/colly"
"io/ioutil"
"log"
"regexp"
"strconv"
"strings"
)
// Book is one entry of a Goodreads book list.
//
// Title, URL and Ratings carry goquery struct tags and are populated by
// goq.UnmarshalSelection in handleBookElement. AvgRating and NumRatings are
// derived from Ratings by parseRatings.
type Book struct {
	Title   string `goquery:"a.bookTitle span"`
	URL     string `goquery:"a,[href]"`
	Ratings string `goquery:"span.minirating"` // raw text, e.g. "4.05 avg rating — 1,234 ratings"

	AvgRating  uint16 // average rating with the decimal point removed: "4.05" -> 405 (0 - 500)
	NumRatings uint32 // total number of ratings; 32 bits because popular books exceed 65,535
}

// ratingsPattern extracts the average rating (group 1) and the rating count
// with optional thousands separators (group 2) from Book.Ratings.
// It is compiled once at package initialisation instead of on every call.
var ratingsPattern = regexp.MustCompile(`(\d\.\d{2}) avg rating — (\d[\d,]*) ratings?`)

// parseRatings fills book.AvgRating and book.NumRatings from the text in
// book.Ratings.
//
// The average is stored with its decimal point removed ("4.05" becomes 405)
// and the count with its thousands separators removed ("1,234,567" becomes
// 1234567). If the text does not match the expected format, or a number does
// not fit its field, the program is terminated through log.Fatalf, matching
// the error handling used by the rest of this file.
func parseRatings(book *Book) {
	groups := ratingsPattern.FindStringSubmatch(book.Ratings)
	if groups == nil {
		// FindStringSubmatch returns nil on no match; indexing it would panic.
		log.Fatalf("unrecognized ratings text %q", book.Ratings)
	}

	avgRating, err := strconv.ParseUint(strings.Replace(groups[1], ".", "", 1), 10, 16)
	if err != nil {
		log.Fatalf("parsing average rating %q: %v", groups[1], err)
	}
	book.AvgRating = uint16(avgRating)

	// Drop every "," so counts of any magnitude parse as one number.
	numRatings, err := strconv.ParseUint(strings.Replace(groups[2], ",", "", -1), 10, 32)
	if err != nil {
		log.Fatalf("parsing rating count %q: %v", groups[2], err)
	}
	book.NumRatings = uint32(numRatings)
}
// handleBookElement is the colly callback for candidate book rows.
//
// Elements whose itemtype attribute is not "http://schema.org/Book" are
// ignored. For a book element the tagged fields of Book are unmarshalled from
// the element's DOM nodes, the ratings text is parsed, and one
// "NumRatings|AvgRating|Title" line is written to standard output.
// An unmarshalling error terminates the program through log.Fatal.
func handleBookElement(bookElement *colly.HTMLElement) {
	if itemType := bookElement.Attr("itemtype"); itemType != "http://schema.org/Book" {
		return
	}

	book := new(Book)
	if err := goq.UnmarshalSelection(goq.NodeSelector(bookElement.DOM.Nodes), book); err != nil {
		log.Fatal(err)
	}

	// Log the raw ratings text before it is parsed into numeric fields.
	log.Println(book.Ratings)
	parseRatings(book)

	fmt.Printf("%d|%d|%s\n", book.NumRatings, book.AvgRating, book.Title)
}
// main crawls a Goodreads author book list, following "next_page" links, and
// lets handleBookElement print one line per book found.
func main() {
	// Discard everything written through the log package. Book records are
	// printed with fmt and are unaffected. log.Fatal still exits with a
	// non-zero status even though its message is dropped.
	log.SetOutput(ioutil.Discard)

	// Collector restricted to Goodreads, with an on-disk response cache.
	// NOTE(review): pagination links below are followed with c.Visit rather
	// than e.Request.Visit; confirm that MaxDepth has the intended effect.
	c := colly.NewCollector(
		colly.AllowedDomains("www.goodreads.com"),
		colly.CacheDir("./cache"),
		colly.MaxDepth(2),
	)

	// Every table row carrying an itemtype attribute is a candidate book.
	c.OnHTML("tr[itemtype]", handleBookElement)

	// Follow pagination: only anchors whose class is exactly "next_page".
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		if e.Attr("class") != "next_page" {
			return
		}
		link := e.Attr("href")
		log.Printf("Link found: %q -> %s\n", e.Text, link)

		// Only links inside AllowedDomains are actually fetched. A rejected
		// or failed follow-up link is reported and skipped so the crawl of
		// the remaining pages continues.
		if err := c.Visit(e.Request.AbsoluteURL(link)); err != nil {
			log.Printf("Skipping %q: %v\n", link, err)
		}
	})

	// Log each URL just before it is requested.
	c.OnRequest(func(r *colly.Request) {
		log.Println("Visiting", r.URL.String())
	})

	// Failing to fetch the start page means there is nothing to scrape, so
	// make that visible through the exit status instead of ending silently.
	if err := c.Visit("https://www.goodreads.com/author/list/12581.Cory_Doctorow"); err != nil {
		log.Fatal(err)
	}
}