From a40f9dd0c32eccd20527ed33d57790e3ea1ab08d Mon Sep 17 00:00:00 2001 From: Baoshuo Date: Tue, 19 Nov 2024 16:22:04 +0800 Subject: [PATCH 1/5] feat: add spider for info22.fzu.edu.cn --- work2/renbaoshuo/.gitignore | 1 + work2/renbaoshuo/fzu/go.mod | 11 ++ work2/renbaoshuo/fzu/go.sum | 40 +++++ work2/renbaoshuo/fzu/main.go | 236 ++++++++++++++++++++++++++++++ work2/renbaoshuo/fzu/res/.gitkeep | 0 5 files changed, 288 insertions(+) create mode 100644 work2/renbaoshuo/.gitignore create mode 100644 work2/renbaoshuo/fzu/go.mod create mode 100644 work2/renbaoshuo/fzu/go.sum create mode 100644 work2/renbaoshuo/fzu/main.go create mode 100644 work2/renbaoshuo/fzu/res/.gitkeep diff --git a/work2/renbaoshuo/.gitignore b/work2/renbaoshuo/.gitignore new file mode 100644 index 0000000..acf02e7 --- /dev/null +++ b/work2/renbaoshuo/.gitignore @@ -0,0 +1 @@ +fzu/res diff --git a/work2/renbaoshuo/fzu/go.mod b/work2/renbaoshuo/fzu/go.mod new file mode 100644 index 0000000..17139ca --- /dev/null +++ b/work2/renbaoshuo/fzu/go.mod @@ -0,0 +1,11 @@ +module main + +go 1.23 + +toolchain go1.23.3 + +require ( + github.com/PuerkitoBio/goquery v1.10.0 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect + golang.org/x/net v0.29.0 // indirect +) diff --git a/work2/renbaoshuo/fzu/go.sum b/work2/renbaoshuo/fzu/go.sum new file mode 100644 index 0000000..78d3c7a --- /dev/null +++ b/work2/renbaoshuo/fzu/go.sum @@ -0,0 +1,40 @@ +github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4= +github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/work2/renbaoshuo/fzu/main.go b/work2/renbaoshuo/fzu/main.go new file mode 100644 index 0000000..0d68555 --- /dev/null +++ b/work2/renbaoshuo/fzu/main.go @@ -0,0 +1,236 @@ +package main + +import ( + "fmt" + "net/http" + "regexp" + "strconv" + "strings" + "time" + + "github.com/PuerkitoBio/goquery" +) + +const ( + baseURL = "https://info22.fzu.edu.cn/" + listURL = "lm_list.jsp?urltype=tree.TreeTempUrl&wbtreeid=1460" + + fmtWithBaseURL = baseURL + "%s" + fmtWithListURL = baseURL + listURL + "&totalpage=%d&PAGENUM=%d" + + // 使用 diff 的方式进行计算,以免页面总数变化对程序的影响(前提:历史文件不删除) + startPageDiffWithLastPage = (1028 - 230) + endPageDiffWithLastPage = (1028 - 347) + + // 选择器 + selectorTotalPagesLink = `.p_last a` + selectorList = `body > div.sy-content > div > div.right.fr > div.list.fl > ul > li` + selectorListLink = `a[href^="content.jsp"]` + selectorTitle = `body > div.wa1200w > div.conth > form > div.conth1` + selectorContent = `#v_news_content` + + // 正则 + patternTotalPages = `(?s)totalpage=(?P\d+)` + patternDate = `(?s)
日期: (?P.*?)  ` + patternAuthor = `(?s)信息来源..(?P.*?)\n..` +) + +var ( + startTime, _ = time.Parse("2006-01-02 15:04:05", "2020-01-01 00:00:00") + endTime, _ = time.Parse("2006-01-02 15:04:05", "2021-09-30 23:59:59") +) + +func httpGet(url string) (string, error) { + client := &http.Client{} + + resp, err := client.Get(url) + if err != nil { + fmt.Printf("Error occurred while getting %s: %s", url, err) + + return "", err + } + + defer resp.Body.Close() + + buf := make([]byte, 4096) + res := "" + + for { + n, _ := resp.Body.Read(buf) + if n == 0 { + break + } + res += string(buf[:n]) + } + + return res, nil +} + +func parseTotalPages(html string) (int, error) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + return 0, err + } + + href, exists := doc.Find(selectorTotalPagesLink).Attr("href") + if !exists { + return 0, fmt.Errorf("no last page link found") + } + regexp := regexp.MustCompile(patternTotalPages) + matches := regexp.FindStringSubmatch(href) + if len(matches) < 2 { + return 0, fmt.Errorf("no total page number found") + } + + totalPages, err := strconv.Atoi(matches[1]) + if err != nil { + return 0, err + } + + return totalPages, nil +} + +func getTotalPages() (int, error) { + url := fmt.Sprintf(fmtWithBaseURL, listURL) + + html, err := httpGet(url) + if err != nil { + return 0, err + } + + return parseTotalPages(html) +} + +func parseListPage(html string) ([]string, error) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + return nil, err + } + + res := []string{} + + doc.Find(selectorList).Each(func(i int, s *goquery.Selection) { + href, exists := s.Find(selectorListLink).Attr("href") + if !exists { + return + } + + pubDate := s.Find("span").Text() + timePubDate, err := time.Parse("2006-01-02", pubDate) + if err != nil { + return + } + + if timePubDate.Before(startTime) || timePubDate.After(endTime) { + return + } + + res = append(res, href) + }) + + return res, nil +} + +func getListPage(page int, totalPages int) ([]string, error) { + url := fmt.Sprintf(fmtWithListURL, totalPages, page) + + html, err := httpGet(url) + if err != nil { + return nil, err + } + + return parseListPage(html) +} + +type ArticleResult struct { + Title string + Content string + Author string + PostDate string // YYYY-MM-DD +} + +func parseArticlePage(html string) (ArticleResult, error) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + return ArticleResult{}, err + } + + title := doc.Find(selectorTitle).Text() + content := doc.Find(selectorContent).Text() + + regexpDate := regexp.MustCompile(patternDate) + regexpAuthor := regexp.MustCompile(patternAuthor) + + matchesDate := regexpDate.FindStringSubmatch(html) + matchesAuthor := regexpAuthor.FindStringSubmatch(html) + + if len(matchesDate) < 2 || len(matchesAuthor) < 2 { + return ArticleResult{}, fmt.Errorf("no date or author found") + } + + postDate := matchesDate[1] + author := matchesAuthor[1] + + return ArticleResult{ + Title: title, + Author: author, + Content: content, + PostDate: postDate, + }, nil +} + +func getArticlePage(urlSuffix string) (ArticleResult, error) { + url := fmt.Sprintf(fmtWithBaseURL, urlSuffix) + + html, err := httpGet(url) + if err != nil { + return ArticleResult{}, err + } + + return parseArticlePage(html) +} + +func main() { + totalPages, err := getTotalPages() + if err != nil { + fmt.Println("Error occurred while getting total pages:", err) + return + } + + fmt.Println("Total pages:", totalPages) + + urlList := []string{} + + for page := (totalPages - startPageDiffWithLastPage); page <= (totalPages - endPageDiffWithLastPage); page++ { + list, err := getListPage(page, totalPages) + if err != nil { + fmt.Println("Error occurred while getting list page:", err) + return + } + + urlList = append(urlList, list...) + } + + fmt.Println("Total articles:", len(urlList)) + + articles := []ArticleResult{} + + for _, url := range urlList { + article, err := getArticlePage(url) + if err != nil { + fmt.Println("Error occurred while getting article page:", err) + return + } + + articles = append(articles, article) + } + + fmt.Println("Total articles parsed:", len(articles)) + + // TODO: Save articles to database + + // print articles basic info + for _, article := range articles { + fmt.Printf("%s\t%s\t\t\t%s\n", article.PostDate, article.Author, article.Title) + } +} diff --git a/work2/renbaoshuo/fzu/res/.gitkeep b/work2/renbaoshuo/fzu/res/.gitkeep new file mode 100644 index 0000000..e69de29 From 58079cc24cfc118fa2783761be21f24bdf772ac2 Mon Sep 17 00:00:00 2001 From: Baoshuo Date: Tue, 19 Nov 2024 22:54:00 +0800 Subject: [PATCH 2/5] feat: storage articles into sqlite database --- work2/renbaoshuo/fzu/go.mod | 1 + work2/renbaoshuo/fzu/go.sum | 2 ++ work2/renbaoshuo/fzu/main.go | 59 ++++++++++++++++++++++++++++++++---- 3 files changed, 56 insertions(+), 6 deletions(-) diff --git a/work2/renbaoshuo/fzu/go.mod b/work2/renbaoshuo/fzu/go.mod index 17139ca..07dcc8a 100644 --- a/work2/renbaoshuo/fzu/go.mod +++ b/work2/renbaoshuo/fzu/go.mod @@ -7,5 +7,6 @@ toolchain go1.23.3 require ( github.com/PuerkitoBio/goquery v1.10.0 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/mattn/go-sqlite3 v1.14.24 // indirect golang.org/x/net v0.29.0 // indirect ) diff --git a/work2/renbaoshuo/fzu/go.sum b/work2/renbaoshuo/fzu/go.sum index 78d3c7a..dd2893c 100644 --- a/work2/renbaoshuo/fzu/go.sum +++ b/work2/renbaoshuo/fzu/go.sum @@ -2,6 +2,8 @@ github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbav github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM= +github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= diff --git a/work2/renbaoshuo/fzu/main.go b/work2/renbaoshuo/fzu/main.go index 0d68555..63ab7f4 100644 --- a/work2/renbaoshuo/fzu/main.go +++ b/work2/renbaoshuo/fzu/main.go @@ -1,14 +1,18 @@ package main import ( + "database/sql" "fmt" + "log" "net/http" + "os" "regexp" "strconv" "strings" "time" "github.com/PuerkitoBio/goquery" + _ "github.com/mattn/go-sqlite3" ) const ( @@ -27,12 +31,15 @@ const ( selectorList = `body > div.sy-content > div > div.right.fr > div.list.fl > ul > li` selectorListLink = `a[href^="content.jsp"]` selectorTitle = `body > div.wa1200w > div.conth > form > div.conth1` - selectorContent = `#v_news_content` + selectorContent = `#vsb_content` // 正则 patternTotalPages = `(?s)totalpage=(?P\d+)` patternDate = `(?s)
日期: (?P.*?)  ` patternAuthor = `(?s)信息来源..(?P.*?)\n..` + + // 数据库 + databasePath = "res/fzu.db" ) var ( @@ -156,7 +163,10 @@ func parseArticlePage(html string) (ArticleResult, error) { } title := doc.Find(selectorTitle).Text() - content := doc.Find(selectorContent).Text() + content, err := doc.Find(selectorContent).Html() + if err != nil { + return ArticleResult{}, err + } regexpDate := regexp.MustCompile(patternDate) regexpAuthor := regexp.MustCompile(patternAuthor) @@ -190,7 +200,44 @@ func getArticlePage(urlSuffix string) (ArticleResult, error) { return parseArticlePage(html) } +func initDb() *sql.DB { + os.Remove(databasePath) + + db, err := sql.Open("sqlite3", databasePath) + if err != nil { + log.Fatalln("Error occurred while opening database:", err) + } + + sqlStmt := ` + create table articles (id integer not null primary key autoincrement, title text, content text, author text, post_date text); + ` + + _, err = db.Exec(sqlStmt) + if err != nil { + log.Fatalf("%q: %s\n", err, sqlStmt) + } + + return db +} + +func insertDb(db *sql.DB, article ArticleResult) { + stmt, err := db.Prepare("INSERT INTO articles(title, content, author, post_date) values(?, ?, ?, ?)") + if err != nil { + log.Fatalln("Error occurred while preparing statement:", err) + } + + defer stmt.Close() + + _, err = stmt.Exec(article.Title, article.Content, article.Author, article.PostDate) + if err != nil { + log.Fatalln("Error occurred while executing statement:", err) + } +} + func main() { + db := initDb() + defer db.Close() + totalPages, err := getTotalPages() if err != nil { fmt.Println("Error occurred while getting total pages:", err) @@ -227,10 +274,10 @@ func main() { fmt.Println("Total articles parsed:", len(articles)) - // TODO: Save articles to database - - // print articles basic info + // Save articles to database using SQLite for _, article := range articles { - fmt.Printf("%s\t%s\t\t\t%s\n", article.PostDate, article.Author, article.Title) + insertDb(db, article) } + + fmt.Println("All articles saved to database:", databasePath) } From cafc848aeb35ca7fa93190c438ca7861be819546 Mon Sep 17 00:00:00 2001 From: Baoshuo Date: Tue, 19 Nov 2024 23:06:08 +0800 Subject: [PATCH 3/5] feat: get clicks of article --- work2/renbaoshuo/fzu/main.go | 42 +++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/work2/renbaoshuo/fzu/main.go b/work2/renbaoshuo/fzu/main.go index 63ab7f4..1e35852 100644 --- a/work2/renbaoshuo/fzu/main.go +++ b/work2/renbaoshuo/fzu/main.go @@ -19,8 +19,9 @@ const ( baseURL = "https://info22.fzu.edu.cn/" listURL = "lm_list.jsp?urltype=tree.TreeTempUrl&wbtreeid=1460" - fmtWithBaseURL = baseURL + "%s" - fmtWithListURL = baseURL + listURL + "&totalpage=%d&PAGENUM=%d" + fmtWithBaseURL = baseURL + "%s" + fmtWithListURL = baseURL + listURL + "&totalpage=%d&PAGENUM=%d" + fmtWithClickURL = baseURL + "system/resource/code/news/click/dynclicks.jsp?clicktype=%s&owner=%s&clickid=%s" // 使用 diff 的方式进行计算,以免页面总数变化对程序的影响(前提:历史文件不删除) startPageDiffWithLastPage = (1028 - 230) @@ -37,6 +38,7 @@ const ( patternTotalPages = `(?s)totalpage=(?P\d+)` patternDate = `(?s)
日期: (?P.*?)  ` patternAuthor = `(?s)信息来源..(?P.*?)\n..` + patternClicks = `_showDynClicks\("(?P\w+)", (?P\d+), (?P\d+)\)` // 数据库 databasePath = "res/fzu.db" @@ -149,11 +151,23 @@ func getListPage(page int, totalPages int) ([]string, error) { return parseListPage(html) } +func getClicks(clickType, owner, clickId string) (int, error) { + url := fmt.Sprintf(fmtWithClickURL, clickType, owner, clickId) + + data, err := httpGet(url) + if err != nil { + return 0, err + } + + return strconv.Atoi(data) +} + type ArticleResult struct { Title string Content string Author string PostDate string // YYYY-MM-DD + Clicks int } func parseArticlePage(html string) (ArticleResult, error) { @@ -170,9 +184,11 @@ func parseArticlePage(html string) (ArticleResult, error) { regexpDate := regexp.MustCompile(patternDate) regexpAuthor := regexp.MustCompile(patternAuthor) + regexpClicks := regexp.MustCompile(patternClicks) matchesDate := regexpDate.FindStringSubmatch(html) matchesAuthor := regexpAuthor.FindStringSubmatch(html) + matchesClicks := regexpClicks.FindStringSubmatch(html) if len(matchesDate) < 2 || len(matchesAuthor) < 2 { return ArticleResult{}, fmt.Errorf("no date or author found") @@ -180,12 +196,21 @@ func parseArticlePage(html string) (ArticleResult, error) { postDate := matchesDate[1] author := matchesAuthor[1] + clickType := matchesClicks[1] + owner := matchesClicks[2] + clickId := matchesClicks[3] + + clicks, err := getClicks(clickType, owner, clickId) + if err != nil { + return ArticleResult{}, err + } return ArticleResult{ Title: title, Author: author, Content: content, PostDate: postDate, + Clicks: clicks, }, nil } @@ -209,7 +234,14 @@ func initDb() *sql.DB { } sqlStmt := ` - create table articles (id integer not null primary key autoincrement, title text, content text, author text, post_date text); + create table articles ( + id integer not null primary key autoincrement, + title text, + content text, + author text, + post_date text, + clicks integer + ); ` _, err = db.Exec(sqlStmt) @@ -221,14 +253,14 @@ func initDb() *sql.DB { } func insertDb(db *sql.DB, article ArticleResult) { - stmt, err := db.Prepare("INSERT INTO articles(title, content, author, post_date) values(?, ?, ?, ?)") + stmt, err := db.Prepare("INSERT INTO articles(title, content, author, post_date, clicks) VALUES(?, ?, ?, ?, ?)") if err != nil { log.Fatalln("Error occurred while preparing statement:", err) } defer stmt.Close() - _, err = stmt.Exec(article.Title, article.Content, article.Author, article.PostDate) + _, err = stmt.Exec(article.Title, article.Content, article.Author, article.PostDate, article.Clicks) if err != nil { log.Fatalln("Error occurred while executing statement:", err) } From 36741eb224470920f66f3f816df5ffbbe41dd5e2 Mon Sep 17 00:00:00 2001 From: Baoshuo Date: Tue, 19 Nov 2024 23:26:04 +0800 Subject: [PATCH 4/5] feat: concurrent fetch --- work2/renbaoshuo/fzu/main.go | 61 ++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/work2/renbaoshuo/fzu/main.go b/work2/renbaoshuo/fzu/main.go index 1e35852..8baeb04 100644 --- a/work2/renbaoshuo/fzu/main.go +++ b/work2/renbaoshuo/fzu/main.go @@ -9,6 +9,7 @@ import ( "regexp" "strconv" "strings" + "sync" "time" "github.com/PuerkitoBio/goquery" @@ -25,7 +26,7 @@ const ( // 使用 diff 的方式进行计算,以免页面总数变化对程序的影响(前提:历史文件不删除) startPageDiffWithLastPage = (1028 - 230) - endPageDiffWithLastPage = (1028 - 347) + endPageDiffWithLastPage = (1028 - 250) // 选择器 selectorTotalPagesLink = `.p_last a` @@ -55,7 +56,6 @@ func httpGet(url string) (string, error) { resp, err := client.Get(url) if err != nil { fmt.Printf("Error occurred while getting %s: %s", url, err) - return "", err } @@ -279,35 +279,56 @@ func main() { fmt.Println("Total pages:", totalPages) urlList := []string{} + var mu sync.Mutex + var wg sync.WaitGroup + sem := make(chan struct{}, 20) // 限制并发 for page := (totalPages - startPageDiffWithLastPage); page <= (totalPages - endPageDiffWithLastPage); page++ { - list, err := getListPage(page, totalPages) - if err != nil { - fmt.Println("Error occurred while getting list page:", err) - return - } - - urlList = append(urlList, list...) + wg.Add(1) + go func(page int) { + defer wg.Done() + sem <- struct{}{} + defer func() { <-sem }() + + list, err := getListPage(page, totalPages) + if err != nil { + fmt.Println("Error occurred while getting list page:", err) + return + } + mu.Lock() + urlList = append(urlList, list...) + mu.Unlock() + }(page) } + wg.Wait() + fmt.Println("Total articles:", len(urlList)) - articles := []ArticleResult{} + articleChan := make(chan ArticleResult, len(urlList)) for _, url := range urlList { - article, err := getArticlePage(url) - if err != nil { - fmt.Println("Error occurred while getting article page:", err) - return - } - - articles = append(articles, article) + wg.Add(1) + go func(url string) { + defer wg.Done() + sem <- struct{}{} + defer func() { <-sem }() + + article, err := getArticlePage(url) + if err != nil { + fmt.Println("Error occurred while getting article page:", err) + return + } + articleChan <- article + }(url) } - fmt.Println("Total articles parsed:", len(articles)) + go func() { + wg.Wait() + close(articleChan) + }() - // Save articles to database using SQLite - for _, article := range articles { + for article := range articleChan { insertDb(db, article) } From 55f83ebf1c09491b2a29edbc66dc007b0b307e9b Mon Sep 17 00:00:00 2001 From: Baoshuo Date: Wed, 20 Nov 2024 18:28:41 +0800 Subject: [PATCH 5/5] Apply suggestions from code review --- work2/renbaoshuo/fzu/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/work2/renbaoshuo/fzu/main.go b/work2/renbaoshuo/fzu/main.go index 8baeb04..5aed556 100644 --- a/work2/renbaoshuo/fzu/main.go +++ b/work2/renbaoshuo/fzu/main.go @@ -55,7 +55,7 @@ func httpGet(url string) (string, error) { resp, err := client.Get(url) if err != nil { - fmt.Printf("Error occurred while getting %s: %s", url, err) + fmt.Printf("Error occurred while performing HTTP GET request to %s: %v", url, err) return "", err } @@ -230,7 +230,7 @@ func initDb() *sql.DB { db, err := sql.Open("sqlite3", databasePath) if err != nil { - log.Fatalln("Error occurred while opening database:", err) + log.Fatalf("Error occurred while opening database '%s': %v", databasePath, err) } sqlStmt := `