Coursera平台上的课程信息爬取(Coursera courses)
go
package main
import (
"encoding/json"
"log"
"os"
"strings"
"github.com/gocolly/colly"
)
// Course holds the details scraped for a single Coursera course.
// Every field is stored as plain text.
type Course struct {
	Title, Description, Creator, Level string
	URL, Language, Commitment          string
	HowToPass, Rating                  string
}
// main crawls the Coursera catalogue starting at the /browse page, scrapes
// every course detail page it discovers, and writes the collected courses to
// standard output as indented JSON. It exits with a non-zero status when the
// start page cannot be visited or the JSON cannot be written.
func main() {
	const (
		// Anchors carrying exactly this class attribute are never followed.
		skippedLinkClass = "Button_1qxkboh-o_O-primary_cv02ee-o_O-md_28awn8-o_O-primaryLink_109aggg"
		// Entry point of the crawl.
		startURL = "https://coursera.org/browse"
		// Selector of the value cell in a row of the basic-info table.
		valueCell = "td:nth-child(2)"
	)

	// Listing collector: limited to the Coursera domains, with responses
	// cached under ./coursera_cache.
	c := colly.NewCollector(
		colly.AllowedDomains("coursera.org", "www.coursera.org"),
		colly.CacheDir("./coursera_cache"),
	)

	// A clone of the listing collector dedicated to course detail pages.
	detailCollector := c.Clone()

	courses := make([]Course, 0, 200)

	// Follow catalogue links found on every page visited by the listing collector.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		if e.Attr("class") == skippedLinkClass {
			return
		}
		link := e.Attr("href")
		// Only links under /browse are followed, and never the ones that
		// carry a "=signup" or "=login" marker.
		if !strings.HasPrefix(link, "/browse") ||
			strings.Contains(link, "=signup") ||
			strings.Contains(link, "=login") {
			return
		}
		// Best effort: a link the collector refuses to visit (presumably a
		// duplicate or filtered URL; confirm against colly's Visit docs)
		// must not stop the crawl, so the error is deliberately dropped.
		_ = e.Request.Visit(link)
	})

	// Trace every request issued by the listing collector.
	c.OnRequest(func(r *colly.Request) {
		log.Println("visiting", r.URL.String())
	})

	// For every anchor with a name attribute, hand links that point at a
	// course page (coursera.org/learn) over to the detail collector.
	c.OnHTML(`a[name]`, func(e *colly.HTMLElement) {
		courseURL := e.Request.AbsoluteURL(e.Attr("href"))
		if strings.Contains(courseURL, "coursera.org/learn") {
			// Best effort for the same reason as the catalogue links:
			// one rejected course page must not abort the crawl.
			_ = detailCollector.Visit(courseURL)
		}
	})

	// Extract the course details from a course page.
	detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) {
		log.Println("Course found", e.Request.URL)
		title := e.ChildText(".course-title")
		if title == "" {
			// The course is still recorded; the log line only flags the gap.
			log.Println("No title found", e.Request.URL)
		}
		course := Course{
			Title:       title,
			URL:         e.Request.URL.String(),
			Description: e.ChildText("div.content"),
			Creator:     e.ChildText("div.creator-names > span"),
		}
		// Each row of the basic-info table holds a label in its first cell
		// and the matching value in the second one.
		e.ForEach("table.basic-info-table tr", func(_ int, el *colly.HTMLElement) {
			switch el.ChildText("td:first-child") {
			case "Language":
				course.Language = el.ChildText(valueCell)
			case "Level":
				course.Level = el.ChildText(valueCell)
			case "Commitment":
				course.Commitment = el.ChildText(valueCell)
			case "How To Pass":
				course.HowToPass = el.ChildText(valueCell)
			case "User Ratings":
				course.Rating = el.ChildText("td:nth-child(2) div:nth-of-type(2)")
			}
		})
		courses = append(courses, course)
	})

	// Start the crawl; without the start page there is nothing to report.
	if err := c.Visit(startURL); err != nil {
		log.Fatalf("visiting %s: %v", startURL, err)
	}

	// Dump the collected courses as indented JSON on standard output.
	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", " ")
	if err := enc.Encode(courses); err != nil {
		log.Fatalf("encoding courses: %v", err)
	}
}