Skip to content

Coursera平台上的课程信息爬取(Coursera courses)


go
package main

import (
	"encoding/json"
	"log"
	"os"
	"strings"
	"github.com/gocolly/colly"
)

// Course holds the details scraped from a single Coursera course page.
// Every field stores the text exactly as it was extracted. The declaration
// order is kept stable because encoding/json emits fields in this order.
type Course struct {
	// Course title, description text, creator names, and difficulty level.
	Title, Description, Creator, Level string

	// Address of the course page the record was built from.
	URL string

	// Language, time commitment, passing requirements, and user rating.
	Language, Commitment, HowToPass, Rating string
}

// main crawls the Coursera catalogue starting from the /browse page and
// prints every course it finds to standard output as an indented JSON array.
//
// Two collectors are used: c walks the /browse listing pages, and
// detailCollector fetches the individual course pages and extracts the
// course fields from them. The program exits with a non-zero status when the
// start page cannot be visited or the result cannot be encoded.
func main() {
	const (
		startURL = "https://coursera.org/browse"
		// Links carrying exactly this class attribute are never followed.
		skippedLinkClass = "Button_1qxkboh-o_O-primary_cv02ee-o_O-md_28awn8-o_O-primaryLink_109aggg"
	)

	c := colly.NewCollector(
		// Only crawl coursera.org and www.coursera.org.
		colly.AllowedDomains("coursera.org", "www.coursera.org"),
		// Cache response pages on disk.
		colly.CacheDir("./coursera_cache"),
	)

	// A second collector, cloned from c, handles the course detail pages.
	detailCollector := c.Clone()

	// Non-nil slice so that an empty crawl still encodes as [] and not null.
	courses := make([]Course, 0, 200)

	// Follow links that lead deeper into the /browse listing.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		if e.Attr("class") == skippedLinkClass {
			return
		}
		link := e.Attr("href")
		if !isBrowseLink(link) {
			return
		}
		// Best-effort: colly reports a rejected link (for example one that
		// was already visited) as an error, which only means there is nothing
		// new to fetch, so the error is deliberately dropped.
		_ = e.Request.Visit(link)
	})

	// Log every request made by the listing collector before it is sent.
	c.OnRequest(func(r *colly.Request) {
		log.Println("visiting", r.URL.String())
	})

	// Anchors with a name attribute may point at course pages; hand those
	// that do over to the detail collector.
	c.OnHTML(`a[name]`, func(e *colly.HTMLElement) {
		courseURL := e.Request.AbsoluteURL(e.Attr("href"))
		if strings.Contains(courseURL, "coursera.org/learn") {
			// Best-effort for the same reason as the listing links above.
			_ = detailCollector.Visit(courseURL)
		}
	})

	// Extract the course details from a course page.
	detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) {
		log.Println("Course found", e.Request.URL)
		title := e.ChildText(".course-title")
		if title == "" {
			// The record is still kept; only the missing title is reported.
			log.Println("No title found", e.Request.URL)
		}
		course := Course{
			Title:       title,
			URL:         e.Request.URL.String(),
			Description: e.ChildText("div.content"),
			Creator:     e.ChildText("div.creator-names > span"),
		}

		// Each row of the basic-info table carries one attribute; the first
		// cell is its label and the second cell its value.
		e.ForEach("table.basic-info-table tr", func(_ int, el *colly.HTMLElement) {
			switch el.ChildText("td:first-child") {
			case "Language":
				course.Language = el.ChildText("td:nth-child(2)")
			case "Level":
				course.Level = el.ChildText("td:nth-child(2)")
			case "Commitment":
				course.Commitment = el.ChildText("td:nth-child(2)")
			case "How To Pass":
				course.HowToPass = el.ChildText("td:nth-child(2)")
			case "User Ratings":
				course.Rating = el.ChildText("td:nth-child(2) div:nth-of-type(2)")
			}
		})
		courses = append(courses, course)
	})

	// Start the crawl. Without the start page there is nothing to report.
	if err := c.Visit(startURL); err != nil {
		log.Fatalf("visiting %s: %v", startURL, err)
	}

	// Dump the collected courses as JSON to standard output.
	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", "  ")
	if err := enc.Encode(courses); err != nil {
		log.Fatalf("encoding courses: %v", err)
	}
}

// isBrowseLink reports whether link should be followed by the listing
// crawler: it must start with "/browse" and must not contain a "=signup" or
// "=login" marker.
func isBrowseLink(link string) bool {
	if !strings.HasPrefix(link, "/browse") {
		return false
	}
	return !strings.Contains(link, "=signup") && !strings.Contains(link, "=login")
}