// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively and notes URLs and URL fragments
// that it's seen and prints a report of missing links at the end.
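//
// Usage:
//
//	linkcheck [-root=http://localhost:6060] [-verbose]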
package main

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #)
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

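// aRx matches anchor tags whose href is a rooted path (one starting with "/"), capturing the path.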
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

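// localLinks returns the deduplicated rooted-path links found in body, skipping anything under /src/.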
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, m[1])
		}
	}
	return
}

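// idRx matches id attributes, capturing the id value.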
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

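// pageIDs returns the values of all id attributes found in body.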
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

// url may contain a #fragment, and the fragment is then noted as needing to exist.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	if u, frag, ok := strings.Cut(url, "#"); ok {
		url = u
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	go func() {
		urlq <- url
	}()
}

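// addProblem records an error for url along with the pages that link to it,
// logging it immediately in verbose mode.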
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

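// crawlLoop fetches URLs from urlq until the channel is closed, recording any fetch errors as problems.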
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
	}
}

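// doCrawl fetches url, follows same-site redirects, queues the local links it finds
// for crawling, and records the fragment IDs present on the page.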
func doCrawl(url string) error {
	defer wg.Done()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := io.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatalf("Error reading %s body: %v", url, err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf("  links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

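	// Wait for all outstanding fetches to finish before closing urlq, which stops crawlLoop.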
	wg.Wait()
	close(urlq)
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}