Open edX平台上的课程信息爬取(Open edX courses)
package main
import (
// DATE_FORMAT is the time layout (abbreviated month, two-digit day,
// four-digit year) passed to time.Parse for the course dates scraped
// from Open edX pages.
const (
	DATE_FORMAT = "Jan 02, 2006"
)
// Course holds the data scraped for a single Open edX course.
// The field notes below describe how main in this file fills each field.
type Course struct {
// CourseID is the value attribute of the page's input[name=course_id] element.
CourseID string
// Run is the second "_"-separated segment of CourseID; it stays empty
// when CourseID contains no "_".
Run string
// Name is the first line of the text found under .course-title.
Name string
// Number is the text of span.course-number.
Number string
// StartDate points at the result of parsing span.start-date with DATE_FORMAT.
// The parse error is discarded, so a failed parse leaves the zero time.
StartDate *time.Time
// EndDate points at the result of parsing the end-date text with DATE_FORMAT.
// The parse error is discarded, so a failed parse leaves the zero time.
EndDate *time.Time
// URL is the relative path "/courses/<CourseID>/about".
URL string
// main creates a colly collector, registers two OnHTML callbacks that
// collect Course values into a slice, and marshals that slice to indented JSON.
// NOTE(review): several statements (closing braces, early returns, the start
// of the crawl and the final output) are not visible in this excerpt, so the
// comments below describe only the statements that are present.
func main() {
// Instantiate default collector
c := colly.NewCollector(
// Using IndonesiaX as sample
// NOTE(review): both allowed-domain strings are empty here — confirm the intended hosts.
colly.AllowedDomains("", ""),
// Cache responses to prevent multiple download of pages
// even if the collector is restarted
// Collected courses; capacity for 200 entries is reserved up front.
courses := make([]Course, 0, 200)
// On every a element which has href attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")
// Branch taken for links that do not start with "/courses/".
if !strings.HasPrefix(link, "/courses/") {
// Start scraping the page under the link found
// Callback for every div whose class attribute is exactly "content-wrapper".
c.OnHTML("div[class=content-wrapper]", func(e *colly.HTMLElement) {
// Branch taken when the wrapper contains no section.course-info element.
if e.DOM.Find("section.course-info").Length() == 0 {
// Keep only the first line of the title text.
title := strings.Split(e.ChildText(".course-title"), "\n")[0]
course_id := e.ChildAttr("input[name=course_id]", "value")
// Parse errors are discarded; a failed parse yields the zero time.
start_date, _ := time.Parse(DATE_FORMAT, e.ChildText("span.start-date"))
// NOTE(review): the selector passed to ChildText is empty — confirm the intended end-date element.
end_date, _ := time.Parse(DATE_FORMAT, e.ChildText(""))
// run is the second "_"-separated segment of course_id, when there is one.
var run string
if len(strings.Split(course_id, "_")) > 1 {
run = strings.Split(course_id, "_")[1]
course := Course{
CourseID: course_id,
Run: run,
Name: title,
Number: e.ChildText("span.course-number"),
StartDate: &start_date,
EndDate: &end_date,
URL: fmt.Sprintf("/courses/%s/about", course_id),
courses = append(courses, course)
// Start scraping on https://openedxdomain/courses
// Convert results to JSON data if the scraping job has finished
jsonData, err := json.MarshalIndent(courses, "", " ")
if err != nil {
// Dump json to the standard output (can be redirected to a file)