// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively and notes URLs and URL fragments
// that it's seen and prints a report of missing links at the end.
package main

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #)
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

// aRx matches an anchor whose href starts with "/" and captures the href
// value (optionally quoted) up to the first whitespace, quote or '>'.
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

// localLinks returns the distinct root-relative link targets found in body,
// in order of first appearance. Targets under /src/ are skipped.
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, ref)
		}
	}
	return links
}

// idRx captures the value of an id attribute (optionally quoted).
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

// pageIDs returns the value of every id attribute found in body, in order
// of appearance. Duplicates are not removed.
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return ids
}

// crawl schedules url to be fetched unless it has already been scheduled.
// sourceURL is the page that linked to url.
//
// url may contain a #fragment, and the fragment is then noted as needing to exist.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	if u, frag, ok := strings.Cut(url, "#"); ok {
		url = u
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	// Send from a separate goroutine: urlq is unbuffered and crawl is also
	// called from the goroutine that receives from it, so a direct send
	// here would block forever.
	go func() {
		urlq <- url
	}()
}

// recordLinkSource notes that the page source links to dest. Any #fragment
// on dest is dropped so the entry is stored under the same fragment-less
// key that addProblem later uses to look it up.
//
// It touches linkSources and so must only be called from the crawlLoop
// goroutine.
func recordLinkSource(dest, source string) {
	page, _, _ := strings.Cut(dest, "#")
	linkSources[page] = append(linkSources[page], source)
}

// addProblem records a failure for url, including the pages known to link
// to it, and logs it immediately when -verbose is set.
//
// It touches linkSources and problems and so must only be called from the
// crawlLoop goroutine.
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

// crawlLoop fetches every URL received on urlq, one at a time, recording a
// problem for each fetch that fails. It returns once urlq is closed.
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
	}
}

// doCrawl fetches url (which must not contain a fragment), schedules the
// local links found in the response and records the ids the page defines.
// On-site redirects are scheduled as new URLs; off-site redirects are
// ignored. It marks one outstanding fetch as done on return.
func doCrawl(url string) error {
	defer wg.Done()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	// RoundTrip is used instead of a Client so redirects are not followed
	// automatically and can be inspected below.
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Close the body on every path, including redirects and error statuses.
	defer res.Body.Close()

	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %w", err)
		}
		dest := newURL.String()
		if !strings.HasPrefix(dest, *root) {
			// Skip off-site redirects.
			return nil
		}
		recordLinkSource(dest, url)
		crawl(dest, url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := io.ReadAll(res.Body)
	if err != nil {
		// Report the failure like any other problem instead of aborting
		// the whole run and losing the problems collected so far.
		return fmt.Errorf("reading body: %w", err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf("  links to %s", ref)
		}
		dest := *root + ref
		recordLinkSource(dest, url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	loopDone := make(chan struct{})
	go func() {
		defer close(loopDone)
		crawlLoop()
	}()
	crawl(*root, "")

	wg.Wait()
	close(urlq)
	// wg reaches zero when the last doCrawl returns, which is before
	// crawlLoop has recorded that fetch's problem. Wait for the loop to
	// exit before touching the state it owns.
	<-loopDone

	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}