// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package comment

import (
	"slices"
	"strings"
	"unicode"
	"unicode/utf8"
)

// A Doc is a parsed Go doc comment.
type Doc struct {
	// Content is the sequence of content blocks in the comment.
	Content []Block

	// Links is the list of link definitions in the comment,
	// in the order in which they appear.
	Links []*LinkDef
}

// A LinkDef is a single link definition.
type LinkDef struct {
	Text string // the link text
	URL  string // the link URL
	Used bool   // whether the comment uses the definition
}

// A Block is block-level content in a doc comment,
// one of [*Code], [*Heading], [*List], or [*Paragraph].
type Block interface {
	block()
}

// A Heading is a doc comment heading.
type Heading struct {
	Text []Text // the heading text
}

func (*Heading) block() {}

// A List is a numbered or bullet list.
// Lists are always non-empty: len(Items) > 0.
// In a numbered list, every Items[i].Number is a non-empty string.
// In a bullet list, every Items[i].Number is an empty string.
type List struct {
	// Items is the list items.
	Items []*ListItem

	// ForceBlankBefore indicates that the list must be
	// preceded by a blank line when reformatting the comment,
	// overriding the usual conditions. See the BlankBefore method.
	//
	// The comment parser sets ForceBlankBefore for any list
	// that is preceded by a blank line, to make sure
	// the blank line is preserved when printing.
	ForceBlankBefore bool

	// ForceBlankBetween indicates that list items must be
	// separated by blank lines when reformatting the comment,
	// overriding the usual conditions. See the BlankBetween method.
	//
	// The comment parser sets ForceBlankBetween for any list
	// that has a blank line between any two of its items, to make sure
	// the blank lines are preserved when printing.
	ForceBlankBetween bool
}

func (*List) block() {}

// BlankBefore reports whether a reformatting of the comment
// should include a blank line before the list.
// The default rule is the same as for [List.BlankBetween]:
// if the list item content contains any blank lines
// (meaning at least one item has multiple paragraphs)
// then the list itself must be preceded by a blank line.
// A preceding blank line can be forced by setting [List].ForceBlankBefore.
func (l *List) BlankBefore() bool {
	return l.ForceBlankBefore || l.BlankBetween()
}

// BlankBetween reports whether a reformatting of the comment
// should include a blank line between each pair of list items.
// The default rule is that if the list item content contains any blank lines
// (meaning at least one item has multiple paragraphs)
// then list items must themselves be separated by blank lines.
// Blank line separators can be forced by setting [List].ForceBlankBetween.
func (l *List) BlankBetween() bool {
	if l.ForceBlankBetween {
		return true
	}
	for _, item := range l.Items {
		// Any item that does not hold exactly one content block
		// forces blank-line separation for the whole list.
		if len(item.Content) != 1 {
			// Unreachable for parsed comments today,
			// since the only way to get multiple item.Content
			// is multiple paragraphs, which must have been
			// separated by a blank line.
			// It can still be reached for lists built by hand.
			return true
		}
	}
	return false
}

// A ListItem is a single item in a numbered or bullet list.
type ListItem struct {
	// Number is a decimal string in a numbered list
	// or an empty string in a bullet list.
	Number string // "1", "2", ...; "" for bullet list

	// Content is the list content.
	// Currently, restrictions in the parser and printer
	// require every element of Content to be a *Paragraph.
	Content []Block // Content of this item.
}

// A Paragraph is a paragraph of text.
type Paragraph struct {
	Text []Text // the text-level content of the paragraph
}

func (*Paragraph) block() {}

// A Code is a preformatted code block.
type Code struct {
	// Text is the preformatted text, ending with a newline character.
	// It may be multiple lines, each of which ends with a newline character.
	// It is never empty, nor does it start or end with a blank line.
	Text string
}

func (*Code) block() {}

// A Text is text-level content in a doc comment,
// one of [Plain], [Italic], [*Link], or [*DocLink].
type Text interface {
	text()
}

// A Plain is a string rendered as plain text (not italicized).
type Plain string

func (Plain) text() {}

// An Italic is a string rendered as italicized text.
type Italic string

func (Italic) text() {}

// A Link is a link to a specific URL.
type Link struct {
	Auto bool   // is this an automatic (implicit) link of a literal URL?
	Text []Text // text of link
	URL  string // target URL of link
}

func (*Link) text() {}

// A DocLink is a link to documentation for a Go package or symbol.
type DocLink struct {
	Text []Text // text of link

	// ImportPath, Recv, and Name identify the Go package or symbol
	// that is the link target. The potential combinations of
	// non-empty fields are:
	//  - ImportPath: a link to another package
	//  - ImportPath, Name: a link to a const, func, type, or var in another package
	//  - ImportPath, Recv, Name: a link to a method in another package
	//  - Name: a link to a const, func, type, or var in this package
	//  - Recv, Name: a link to a method in this package
	ImportPath string // import path
	Recv       string // receiver type, without any pointer star, for methods
	Name       string // const, func, type, var, or method name
}

func (*DocLink) text() {}

// A Parser is a doc comment parser.
// The fields in the struct can be filled in before calling [Parser.Parse]
// in order to customize the details of the parsing process.
type Parser struct {
	// Words is a map of Go identifier words that
	// should be italicized and potentially linked.
	// If Words[w] is the empty string, then the word w
	// is only italicized. Otherwise it is linked, using
	// Words[w] as the link target.
	// Words corresponds to the [go/doc.ToHTML] words parameter.
	Words map[string]string

	// LookupPackage resolves a package name to an import path.
	//
	// If LookupPackage(name) returns ok == true, then [name]
	// (or [name.Sym] or [name.Sym.Method])
	// is considered a documentation link to importPath's package docs.
	// It is valid to return "", true, in which case name is considered
	// to refer to the current package.
	//
	// If LookupPackage(name) returns ok == false,
	// then [name] (or [name.Sym] or [name.Sym.Method])
	// will not be considered a documentation link,
	// except in the case where name is the full (but single-element) import path
	// of a package in the standard library, such as in [math] or [io.Reader].
	// LookupPackage is still called for such names,
	// in order to permit references to imports of other packages
	// with the same package names.
	//
	// Setting LookupPackage to nil is equivalent to setting it to
	// a function that always returns "", false.
	LookupPackage func(name string) (importPath string, ok bool)

	// LookupSym reports whether a symbol name or method name
	// exists in the current package.
	//
	// If LookupSym("", "Name") returns true, then [Name]
	// is considered a documentation link for a const, func, type, or var.
	//
	// Similarly, if LookupSym("Recv", "Name") returns true,
	// then [Recv.Name] is considered a documentation link for
	// type Recv's method Name.
	//
	// Setting LookupSym to nil is equivalent to setting it to a function
	// that always returns false.
	LookupSym func(recv, name string) (ok bool)
}

// parseDoc is parsing state for a single doc comment.
type parseDoc struct {
	*Parser
	*Doc
	links     map[string]*LinkDef          // link text -> first definition seen with that text
	lines     []string                     // the comment text, unindented and split into lines
	lookupSym func(recv, name string) bool // Parser.LookupSym, or an always-false stand-in when that is nil
}

// lookupPkg is called to look up the pkg in [pkg], [pkg.Name], and [pkg.Recv.Name].
// If pkg has a slash, it is assumed to be the full import path and is returned
// with ok = true, provided it is a syntactically valid import path;
// otherwise the result is "", false.
//
// Otherwise, pkg is probably a simple package name like "rand" (not "crypto/rand" or "math/rand").
// d.LookupPackage provides a way for the caller to allow resolving such names with reference
// to the imports in the surrounding package.
//
// There is one collision between these two cases: single-element standard library names
// like "math" are full import paths but don't contain slashes. We let d.LookupPackage have
// the first chance to resolve it, in case there's a different package imported as math,
// and otherwise we refer to a built-in list of single-element standard library package names.
func (d *parseDoc) lookupPkg(pkg string) (importPath string, ok bool) {
	if strings.Contains(pkg, "/") { // assume a full import path
		if validImportPath(pkg) {
			return pkg, true
		}
		return "", false
	}
	if d.LookupPackage != nil {
		// Give LookupPackage a chance.
		if path, ok := d.LookupPackage(pkg); ok {
			return path, true
		}
	}
	// Fall back to the single-element standard library names.
	return DefaultLookupPackage(pkg)
}

// isStdPkg reports whether path is present in stdPkgs.
//
// NOTE(review): stdPkgs is declared outside this view. slices.BinarySearch
// requires its input to be sorted in ascending order, so stdPkgs is
// presumably kept sorted; confirm at its declaration.
func isStdPkg(path string) bool {
	_, ok := slices.BinarySearch(stdPkgs, path)
	return ok
}

// DefaultLookupPackage is the default package lookup
// function, used when [Parser.LookupPackage] is nil.
// It recognizes names of the packages from the standard
// library with single-element import paths, such as math,
// which would otherwise be impossible to name.
//
// Note that the go/doc package provides a more sophisticated
// lookup based on the imports used in the current package.
func DefaultLookupPackage(name string) (importPath string, ok bool) {
	if isStdPkg(name) {
		return name, true
	}
	return "", false
}

// Parse parses the doc comment text and returns the *[Doc] form.
// Comment markers (/* // and */) in the text must have already been removed.
func (p *Parser) Parse(text string) *Doc {
	lines := unindent(strings.Split(text, "\n"))
	d := &parseDoc{
		Parser:    p,
		Doc:       new(Doc),
		links:     make(map[string]*LinkDef),
		lines:     lines,
		lookupSym: func(recv, name string) bool { return false },
	}
	// A caller-supplied LookupSym replaces the always-false default.
	if p.LookupSym != nil {
		d.lookupSym = p.LookupSym
	}

	// First pass: break into block structure and collect known links.
	// The text is all recorded as Plain for now.
	var prev span
	for _, s := range parseSpans(lines) {
		var b Block
		switch s.kind {
		default:
			panic("go/doc/comment: internal error: unknown span kind")
		case spanList:
			// A gap between the previous span and this one means
			// blank lines separated them.
			b = d.list(lines[s.start:s.end], prev.end < s.start)
		case spanCode:
			b = d.code(lines[s.start:s.end])
		case spanOldHeading:
			b = d.oldHeading(lines[s.start])
		case spanHeading:
			b = d.heading(lines[s.start])
		case spanPara:
			b = d.paragraph(lines[s.start:s.end])
		}
		// paragraph returns nil for a block of link definitions,
		// which contributes no content.
		if b != nil {
			d.Content = append(d.Content, b)
		}
		prev = s
	}

	// Second pass: interpret all the Plain text now that we know the links.
	// Headings and code blocks are left as the first pass produced them.
	for _, b := range d.Content {
		switch b := b.(type) {
		case *Paragraph:
			b.Text = d.parseLinkedText(string(b.Text[0].(Plain)))
		case *List:
			for _, i := range b.Items {
				for _, c := range i.Content {
					p := c.(*Paragraph)
					p.Text = d.parseLinkedText(string(p.Text[0].(Plain)))
				}
			}
		}
	}

	return d.Doc
}

// A span represents a single span of comment lines (lines[start:end])
// of an identified kind (code, heading, paragraph, and so on).
type span struct {
	start int
	end   int
	kind  spanKind
}

// A spanKind describes the kind of span.
type spanKind int

const (
	_              spanKind = iota // zero value is deliberately not a valid kind
	spanCode                       // indented lines that do not start with a list marker
	spanHeading                    // single line accepted by isHeading
	spanList                       // indented lines whose first line starts with a list marker
	spanOldHeading                 // single line accepted by isOldHeading
	spanPara                       // any other run of unindented lines
)

// parseSpans partitions lines into consecutive spans, each classified
// as code, list, heading, old-style heading, or paragraph.
// Blank lines separating spans belong to no span; blank lines inside
// an indented span are kept as part of it.
// It panics if its internal watchdog detects that the loop has stopped
// making progress.
func parseSpans(lines []string) []span {
	var spans []span

	// The loop may process a line twice: once as unindented
	// and again forced indented. So the maximum expected
	// number of iterations is 2*len(lines). The repeating logic
	// can be subtle, though, and to protect against introduction
	// of infinite loops in future changes, we watch to see that
	// we are not looping too much. A panic is better than a
	// quiet infinite loop.
	watchdog := 2 * len(lines)

	i := 0
	forceIndent := 0 // lines[i] with i < forceIndent are treated as indented even if they are not
Spans:
	for {
		// Skip blank lines.
		for i < len(lines) && lines[i] == "" {
			i++
		}
		if i >= len(lines) {
			break
		}
		if watchdog--; watchdog < 0 {
			panic("go/doc/comment: internal error: not making progress")
		}

		var kind spanKind
		start := i
		end := i
		if i < forceIndent || indented(lines[i]) {
			// Indented (or force indented).
			// Ends before next unindented. (Blank lines are OK.)
			// If this is an unindented list that we are heuristically treating as indented,
			// then accept unindented list item lines up to the first blank lines.
			// The heuristic is disabled at blank lines to contain its effect
			// to non-gofmt'ed sections of the comment.
			unindentedListOK := isList(lines[i]) && i < forceIndent
			i++
			for i < len(lines) && (lines[i] == "" || i < forceIndent || indented(lines[i]) || (unindentedListOK && isList(lines[i]))) {
				if lines[i] == "" {
					unindentedListOK = false
				}
				i++
			}

			// Drop trailing blank lines.
			end = i
			for end > start && lines[end-1] == "" {
				end--
			}

			// If indented lines are followed (without a blank line)
			// by an unindented line starting with a closing brace,
			// take that one line too. This fixes the common mistake
			// of pasting in something like
			//
			//	func main() {
			//		fmt.Println("hello, world")
			//	}
			//
			// and forgetting to indent it.
			// The heuristic will never trigger on a gofmt'ed comment,
			// because any gofmt'ed code block or list would be
			// followed by a blank line or end of comment.
			if end < len(lines) && strings.HasPrefix(lines[end], "}") {
				end++
			}

			if isList(lines[start]) {
				kind = spanList
			} else {
				kind = spanCode
			}
		} else {
			// Unindented. Ends at next blank or indented line.
			i++
			for i < len(lines) && lines[i] != "" && !indented(lines[i]) {
				i++
			}
			end = i

			// If unindented lines are followed (without a blank line)
			// by an indented line that would start a code block,
			// check whether the final unindented lines
			// should be left for the indented section.
			// This can happen for the common mistakes of
			// unindented code or unindented lists.
			// The heuristic will never trigger on a gofmt'ed comment,
			// because any gofmt'ed code block would have a blank line
			// preceding it after the unindented lines.
			if i < len(lines) && lines[i] != "" && !isList(lines[i]) {
				switch {
				case isList(lines[i-1]):
					// If the final unindented line looks like a list item,
					// this may be the first indented line wrap of
					// a mistakenly unindented list.
					// Leave all the unindented list items.
					forceIndent = end
					end--
					for end > start && isList(lines[end-1]) {
						end--
					}

				case strings.HasSuffix(lines[i-1], "{") || strings.HasSuffix(lines[i-1], `\`):
					// If the final unindented line ended in { or \
					// it is probably the start of a misindented code block.
					// Give the user a single line fix.
					// Often that's enough; if not, the user can fix the others themselves.
					forceIndent = end
					end--
				}

				// Nothing is left for the unindented span:
				// reprocess the same lines, this time force indented.
				if start == end && forceIndent > start {
					i = start
					continue Spans
				}
			}

			// Span is either paragraph or heading.
			if end-start == 1 && isHeading(lines[start]) {
				kind = spanHeading
			} else if end-start == 1 && isOldHeading(lines[start], lines, start) {
				kind = spanOldHeading
			} else {
				kind = spanPara
			}
		}

		spans = append(spans, span{start, end, kind})
		i = end
	}

	return spans
}

// indented reports whether line is indented
// (starts with a leading space or tab).
func indented(line string) bool {
	if len(line) == 0 {
		return false
	}
	switch line[0] {
	case ' ', '\t':
		return true
	}
	return false
}

// unindent returns a copy of lines with the whitespace prefix shared by
// all of its non-blank lines removed.
// Blank lines at either end are dropped, and any line that consists
// only of white space is replaced by the empty string.
// The input slice is not modified.
func unindent(lines []string) []string {
	// Narrow the window to exclude blank lines at both ends.
	first, last := 0, len(lines)
	for first < last && isBlank(lines[first]) {
		first++
	}
	for last > first && isBlank(lines[last-1]) {
		last--
	}
	if first == last {
		return nil
	}
	lines = lines[first:last]

	// The shared indentation is the common prefix of the
	// leading white space of every non-blank line.
	indent := leadingSpace(lines[0])
	for _, l := range lines[1:] {
		if isBlank(l) {
			continue
		}
		indent = commonPrefix(indent, leadingSpace(l))
	}

	out := make([]string, 0, len(lines))
	for _, l := range lines {
		l = strings.TrimPrefix(l, indent)
		if strings.TrimSpace(l) == "" {
			l = ""
		}
		out = append(out, l)
	}

	// Lines that only became empty after trimming may now sit
	// at either end; drop those as well.
	lo, hi := 0, len(out)
	for lo < hi && out[lo] == "" {
		lo++
	}
	for hi > lo && out[hi-1] == "" {
		hi--
	}
	return out[lo:hi]
}

// isBlank reports whether s is a blank line:
// either empty or a lone newline character.
func isBlank(s string) bool {
	return s == "" || s == "\n"
}

// commonPrefix returns the longest string that is a prefix of both a and b.
func commonPrefix(a, b string) string {
	limit := len(a)
	if len(b) < limit {
		limit = len(b)
	}
	for k := 0; k < limit; k++ {
		if a[k] != b[k] {
			return a[:k]
		}
	}
	return a[:limit]
}

// leadingSpace returns the run of spaces and tabs at the start of s.
func leadingSpace(s string) string {
	rest := strings.TrimLeft(s, " \t")
	return s[:len(s)-len(rest)]
}

// isOldHeading reports whether line is an old-style section heading.
// line is all[off].
568func isOldHeading(line string, all []string, off int) bool { 569 if off <= 0 || all[off-1] != "" || off+2 >= len(all) || all[off+1] != "" || leadingSpace(all[off+2]) != "" { 570 return false 571 } 572 573 line = strings.TrimSpace(line) 574 575 // a heading must start with an uppercase letter 576 r, _ := utf8.DecodeRuneInString(line) 577 if !unicode.IsLetter(r) || !unicode.IsUpper(r) { 578 return false 579 } 580 581 // it must end in a letter or digit: 582 r, _ = utf8.DecodeLastRuneInString(line) 583 if !unicode.IsLetter(r) && !unicode.IsDigit(r) { 584 return false 585 } 586 587 // exclude lines with illegal characters. we allow "()," 588 if strings.ContainsAny(line, ";:!?+*/=[]{}_^°&§~%#@<\">\\") { 589 return false 590 } 591 592 // allow "'" for possessive "'s" only 593 for b := line; ; { 594 var ok bool 595 if _, b, ok = strings.Cut(b, "'"); !ok { 596 break 597 } 598 if b != "s" && !strings.HasPrefix(b, "s ") { 599 return false // ' not followed by s and then end-of-word 600 } 601 } 602 603 // allow "." when followed by non-space 604 for b := line; ; { 605 var ok bool 606 if _, b, ok = strings.Cut(b, "."); !ok { 607 break 608 } 609 if b == "" || strings.HasPrefix(b, " ") { 610 return false // not followed by non-space 611 } 612 } 613 614 return true 615} 616 617// oldHeading returns the *Heading for the given old-style section heading line. 618func (d *parseDoc) oldHeading(line string) Block { 619 return &Heading{Text: []Text{Plain(strings.TrimSpace(line))}} 620} 621 622// isHeading reports whether line is a new-style section heading. 623func isHeading(line string) bool { 624 return len(line) >= 2 && 625 line[0] == '#' && 626 (line[1] == ' ' || line[1] == '\t') && 627 strings.TrimSpace(line) != "#" 628} 629 630// heading returns the *Heading for the given new-style section heading line. 
631func (d *parseDoc) heading(line string) Block { 632 return &Heading{Text: []Text{Plain(strings.TrimSpace(line[1:]))}} 633} 634 635// code returns a code block built from the lines. 636func (d *parseDoc) code(lines []string) *Code { 637 body := unindent(lines) 638 body = append(body, "") // to get final \n from Join 639 return &Code{Text: strings.Join(body, "\n")} 640} 641 642// paragraph returns a paragraph block built from the lines. 643// If the lines are link definitions, paragraph adds them to d and returns nil. 644func (d *parseDoc) paragraph(lines []string) Block { 645 // Is this a block of known links? Handle. 646 var defs []*LinkDef 647 for _, line := range lines { 648 def, ok := parseLink(line) 649 if !ok { 650 goto NoDefs 651 } 652 defs = append(defs, def) 653 } 654 for _, def := range defs { 655 d.Links = append(d.Links, def) 656 if d.links[def.Text] == nil { 657 d.links[def.Text] = def 658 } 659 } 660 return nil 661NoDefs: 662 663 return &Paragraph{Text: []Text{Plain(strings.Join(lines, "\n"))}} 664} 665 666// parseLink parses a single link definition line: 667// 668// [text]: url 669// 670// It returns the link definition and whether the line was well formed. 671func parseLink(line string) (*LinkDef, bool) { 672 if line == "" || line[0] != '[' { 673 return nil, false 674 } 675 i := strings.Index(line, "]:") 676 if i < 0 || i+3 >= len(line) || (line[i+2] != ' ' && line[i+2] != '\t') { 677 return nil, false 678 } 679 680 text := line[1:i] 681 url := strings.TrimSpace(line[i+3:]) 682 j := strings.Index(url, "://") 683 if j < 0 || !isScheme(url[:j]) { 684 return nil, false 685 } 686 687 // Line has right form and has valid scheme://. 688 // That's good enough for us - we are not as picky 689 // about the characters beyond the :// as we are 690 // when extracting inline URLs from text. 
691 return &LinkDef{Text: text, URL: url}, true 692} 693 694// list returns a list built from the indented lines, 695// using forceBlankBefore as the value of the List's ForceBlankBefore field. 696func (d *parseDoc) list(lines []string, forceBlankBefore bool) *List { 697 num, _, _ := listMarker(lines[0]) 698 var ( 699 list *List = &List{ForceBlankBefore: forceBlankBefore} 700 item *ListItem 701 text []string 702 ) 703 flush := func() { 704 if item != nil { 705 if para := d.paragraph(text); para != nil { 706 item.Content = append(item.Content, para) 707 } 708 } 709 text = nil 710 } 711 712 for _, line := range lines { 713 if n, after, ok := listMarker(line); ok && (n != "") == (num != "") { 714 // start new list item 715 flush() 716 717 item = &ListItem{Number: n} 718 list.Items = append(list.Items, item) 719 line = after 720 } 721 line = strings.TrimSpace(line) 722 if line == "" { 723 list.ForceBlankBetween = true 724 flush() 725 continue 726 } 727 text = append(text, strings.TrimSpace(line)) 728 } 729 flush() 730 return list 731} 732 733// listMarker parses the line as beginning with a list marker. 734// If it can do that, it returns the numeric marker ("" for a bullet list), 735// the rest of the line, and ok == true. 736// Otherwise, it returns "", "", false. 737func listMarker(line string) (num, rest string, ok bool) { 738 line = strings.TrimSpace(line) 739 if line == "" { 740 return "", "", false 741 } 742 743 // Can we find a marker? 744 if r, n := utf8.DecodeRuneInString(line); r == '•' || r == '*' || r == '+' || r == '-' { 745 num, rest = "", line[n:] 746 } else if '0' <= line[0] && line[0] <= '9' { 747 n := 1 748 for n < len(line) && '0' <= line[n] && line[n] <= '9' { 749 n++ 750 } 751 if n >= len(line) || (line[n] != '.' 
&& line[n] != ')') { 752 return "", "", false 753 } 754 num, rest = line[:n], line[n+1:] 755 } else { 756 return "", "", false 757 } 758 759 if !indented(rest) || strings.TrimSpace(rest) == "" { 760 return "", "", false 761 } 762 763 return num, rest, true 764} 765 766// isList reports whether the line is the first line of a list, 767// meaning starts with a list marker after any indentation. 768// (The caller is responsible for checking the line is indented, as appropriate.) 769func isList(line string) bool { 770 _, _, ok := listMarker(line) 771 return ok 772} 773 774// parseLinkedText parses text that is allowed to contain explicit links, 775// such as [math.Sin] or [Go home page], into a slice of Text items. 776// 777// A “pkg” is only assumed to be a full import path if it starts with 778// a domain name (a path element with a dot) or is one of the packages 779// from the standard library (“[os]”, “[encoding/json]”, and so on). 780// To avoid problems with maps, generics, and array types, doc links 781// must be both preceded and followed by punctuation, spaces, tabs, 782// or the start or end of a line. An example problem would be treating 783// map[ast.Expr]TypeAndValue as containing a link. 
784func (d *parseDoc) parseLinkedText(text string) []Text { 785 var out []Text 786 wrote := 0 787 flush := func(i int) { 788 if wrote < i { 789 out = d.parseText(out, text[wrote:i], true) 790 wrote = i 791 } 792 } 793 794 start := -1 795 var buf []byte 796 for i := 0; i < len(text); i++ { 797 c := text[i] 798 if c == '\n' || c == '\t' { 799 c = ' ' 800 } 801 switch c { 802 case '[': 803 start = i 804 case ']': 805 if start >= 0 { 806 if def, ok := d.links[string(buf)]; ok { 807 def.Used = true 808 flush(start) 809 out = append(out, &Link{ 810 Text: d.parseText(nil, text[start+1:i], false), 811 URL: def.URL, 812 }) 813 wrote = i + 1 814 } else if link, ok := d.docLink(text[start+1:i], text[:start], text[i+1:]); ok { 815 flush(start) 816 link.Text = d.parseText(nil, text[start+1:i], false) 817 out = append(out, link) 818 wrote = i + 1 819 } 820 } 821 start = -1 822 buf = buf[:0] 823 } 824 if start >= 0 && i != start { 825 buf = append(buf, c) 826 } 827 } 828 829 flush(len(text)) 830 return out 831} 832 833// docLink parses text, which was found inside [ ] brackets, 834// as a doc link if possible, returning the DocLink and ok == true 835// or else nil, false. 836// The before and after strings are the text before the [ and after the ] 837// on the same line. Doc links must be preceded and followed by 838// punctuation, spaces, tabs, or the start or end of a line. 
839func (d *parseDoc) docLink(text, before, after string) (link *DocLink, ok bool) { 840 if before != "" { 841 r, _ := utf8.DecodeLastRuneInString(before) 842 if !unicode.IsPunct(r) && r != ' ' && r != '\t' && r != '\n' { 843 return nil, false 844 } 845 } 846 if after != "" { 847 r, _ := utf8.DecodeRuneInString(after) 848 if !unicode.IsPunct(r) && r != ' ' && r != '\t' && r != '\n' { 849 return nil, false 850 } 851 } 852 text = strings.TrimPrefix(text, "*") 853 pkg, name, ok := splitDocName(text) 854 var recv string 855 if ok { 856 pkg, recv, _ = splitDocName(pkg) 857 } 858 if pkg != "" { 859 if pkg, ok = d.lookupPkg(pkg); !ok { 860 return nil, false 861 } 862 } else { 863 if ok = d.lookupSym(recv, name); !ok { 864 return nil, false 865 } 866 } 867 link = &DocLink{ 868 ImportPath: pkg, 869 Recv: recv, 870 Name: name, 871 } 872 return link, true 873} 874 875// If text is of the form before.Name, where Name is a capitalized Go identifier, 876// then splitDocName returns before, name, true. 877// Otherwise it returns text, "", false. 878func splitDocName(text string) (before, name string, foundDot bool) { 879 i := strings.LastIndex(text, ".") 880 name = text[i+1:] 881 if !isName(name) { 882 return text, "", false 883 } 884 if i >= 0 { 885 before = text[:i] 886 } 887 return before, name, true 888} 889 890// parseText parses s as text and returns the result of appending 891// those parsed Text elements to out. 892// parseText does not handle explicit links like [math.Sin] or [Go home page]: 893// those are handled by parseLinkedText. 894// If autoLink is true, then parseText recognizes URLs and words from d.Words 895// and converts those to links as appropriate. 
896func (d *parseDoc) parseText(out []Text, s string, autoLink bool) []Text { 897 var w strings.Builder 898 wrote := 0 899 writeUntil := func(i int) { 900 w.WriteString(s[wrote:i]) 901 wrote = i 902 } 903 flush := func(i int) { 904 writeUntil(i) 905 if w.Len() > 0 { 906 out = append(out, Plain(w.String())) 907 w.Reset() 908 } 909 } 910 for i := 0; i < len(s); { 911 t := s[i:] 912 if autoLink { 913 if url, ok := autoURL(t); ok { 914 flush(i) 915 // Note: The old comment parser would look up the URL in words 916 // and replace the target with words[URL] if it was non-empty. 917 // That would allow creating links that display as one URL but 918 // when clicked go to a different URL. Not sure what the point 919 // of that is, so we're not doing that lookup here. 920 out = append(out, &Link{Auto: true, Text: []Text{Plain(url)}, URL: url}) 921 i += len(url) 922 wrote = i 923 continue 924 } 925 if id, ok := ident(t); ok { 926 url, italics := d.Words[id] 927 if !italics { 928 i += len(id) 929 continue 930 } 931 flush(i) 932 if url == "" { 933 out = append(out, Italic(id)) 934 } else { 935 out = append(out, &Link{Auto: true, Text: []Text{Italic(id)}, URL: url}) 936 } 937 i += len(id) 938 wrote = i 939 continue 940 } 941 } 942 switch { 943 case strings.HasPrefix(t, "``"): 944 if len(t) >= 3 && t[2] == '`' { 945 // Do not convert `` inside ```, in case people are mistakenly writing Markdown. 946 i += 3 947 for i < len(t) && t[i] == '`' { 948 i++ 949 } 950 break 951 } 952 writeUntil(i) 953 w.WriteRune('“') 954 i += 2 955 wrote = i 956 case strings.HasPrefix(t, "''"): 957 writeUntil(i) 958 w.WriteRune('”') 959 i += 2 960 wrote = i 961 default: 962 i++ 963 } 964 } 965 flush(len(s)) 966 return out 967} 968 969// autoURL checks whether s begins with a URL that should be hyperlinked. 970// If so, it returns the URL, which is a prefix of s, and ok == true. 971// Otherwise it returns "", false. 
// The caller should skip over the first len(url) bytes of s
// before further processing.
func autoURL(s string) (url string, ok bool) {
	// Find the ://. Fast path to pick off non-URL,
	// since we call this at every position in the string.
	// The shortest possible URL is ftp://x, 7 bytes.
	var i int
	switch {
	case len(s) < 7:
		return "", false
	case s[3] == ':':
		i = 3
	case s[4] == ':':
		i = 4
	case s[5] == ':':
		i = 5
	case s[6] == ':':
		i = 6
	default:
		return "", false
	}
	if i+3 > len(s) || s[i:i+3] != "://" {
		return "", false
	}

	// Check valid scheme.
	if !isScheme(s[:i]) {
		return "", false
	}

	// Scan host part. Must have at least one byte,
	// and must start and end in non-punctuation.
	i += 3
	if i >= len(s) || !isHost(s[i]) || isPunct(s[i]) {
		return "", false
	}
	i++
	end := i
	for i < len(s) && isHost(s[i]) {
		// Only a non-punctuation byte can end the host,
		// so trailing punctuation is scanned but not kept.
		if !isPunct(s[i]) {
			end = i + 1
		}
		i++
	}
	i = end

	// At this point we are definitely returning a URL (scheme://host).
	// We just have to find the longest path we can add to it.
	// Heuristics abound.
	// We allow parens, braces, and brackets,
	// but only if they match (#5043, #22285).
	// We allow .,:;?! in the path but not at the end,
	// to avoid end-of-sentence punctuation (#18139, #16565).
	//
	// stk holds the closing bracket expected for each bracket
	// that is currently open; a nil slice is fine for append.
	var stk []byte
	end = i
Path:
	for ; i < len(s); i++ {
		if isPunct(s[i]) {
			// Punctuation is tentatively accepted: it is kept only
			// if a later path byte advances end past it.
			continue
		}
		if !isPath(s[i]) {
			break
		}
		switch s[i] {
		case '(':
			stk = append(stk, ')')
		case '{':
			stk = append(stk, '}')
		case '[':
			stk = append(stk, ']')
		case ')', '}', ']':
			// A closing bracket that does not match the innermost
			// open bracket ends the URL before that bracket.
			if len(stk) == 0 || stk[len(stk)-1] != s[i] {
				break Path
			}
			stk = stk[:len(stk)-1]
		}
		// Only extend the URL while every bracket is balanced,
		// so an unclosed bracket and what follows it are dropped.
		if len(stk) == 0 {
			end = i + 1
		}
	}

	return s[:end], true
}

// isScheme reports whether s is a recognized URL scheme.
// Note that if strings of new length (beyond 3-7)
// are added here, the fast path at the top of autoURL will need updating.
func isScheme(s string) bool {
	switch s {
	case "file",
		"ftp",
		"gopher",
		"http",
		"https",
		"mailto",
		"nntp":
		return true
	}
	return false
}

// isHost reports whether c is a byte that can appear in a URL host,
// like www.example.com or user@[::1]:8080
func isHost(c byte) bool {
	// mask is a 128-bit bitmap with 1s for allowed bytes,
	// so that the byte c can be tested with a shift and an and.
	// The low 64 bits are tested with 1<<c and the high 64 bits
	// with 1<<(c-64); a uint64 shift by 64 or more yields zero,
	// and for c < 64 the byte subtraction c-64 wraps to at least 192,
	// so only the half of the bitmap that contains c contributes.
	// If c >= 128, then 1<<c and 1<<(c-64) will both be zero,
	// and this function will return false.
	const mask = 0 |
		(1<<26-1)<<'A' |
		(1<<26-1)<<'a' |
		(1<<10-1)<<'0' |
		1<<'_' |
		1<<'@' |
		1<<'-' |
		1<<'.' |
		1<<'[' |
		1<<']' |
		1<<':'

	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
		(uint64(1)<<(c-64))&(mask>>64)) != 0
}

// isPunct reports whether c is a punctuation byte that can appear
// inside a path but not at the end.
func isPunct(c byte) bool {
	// mask is a 128-bit bitmap with 1s for allowed bytes,
	// so that the byte c can be tested with a shift and an and.
	// The low 64 bits are tested with 1<<c and the high 64 bits
	// with 1<<(c-64); a uint64 shift by 64 or more yields zero,
	// and for c < 64 the byte subtraction c-64 wraps to at least 192,
	// so only the half of the bitmap that contains c contributes.
	// If c >= 128, then 1<<c and 1<<(c-64) will both be zero,
	// and this function will return false.
	const mask = 0 |
		1<<'.' |
		1<<',' |
		1<<':' |
		1<<';' |
		1<<'?' |
		1<<'!'

	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
		(uint64(1)<<(c-64))&(mask>>64)) != 0
}

// isPath reports whether c is a (non-punctuation) path byte.
func isPath(c byte) bool {
	// mask is a 128-bit bitmap with 1s for allowed bytes,
	// so that the byte c can be tested with a shift and an and.
	// The low 64 bits are tested with 1<<c and the high 64 bits
	// with 1<<(c-64); a uint64 shift by 64 or more yields zero,
	// and for c < 64 the byte subtraction c-64 wraps to at least 192,
	// so only the half of the bitmap that contains c contributes.
	// If c >= 128, then 1<<c and 1<<(c-64) will both be zero,
	// and this function will return false.
	const mask = 0 |
		(1<<26-1)<<'A' |
		(1<<26-1)<<'a' |
		(1<<10-1)<<'0' |
		1<<'$' |
		1<<'\'' |
		1<<'(' |
		1<<')' |
		1<<'*' |
		1<<'+' |
		1<<'&' |
		1<<'#' |
		1<<'=' |
		1<<'@' |
		1<<'~' |
		1<<'_' |
		1<<'/' |
		1<<'-' |
		1<<'[' |
		1<<']' |
		1<<'{' |
		1<<'}' |
		1<<'%'

	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
		(uint64(1)<<(c-64))&(mask>>64)) != 0
}

// isName reports whether s is a capitalized Go identifier (like Name).
func isName(s string) bool {
	// The whole string must be an identifier, not just a prefix of it.
	t, ok := ident(s)
	if !ok || t != s {
		return false
	}
	r, _ := utf8.DecodeRuneInString(s)
	return unicode.IsUpper(r)
}

// ident checks whether s begins with a Go identifier.
// If so, it returns the identifier, which is a prefix of s, and ok == true.
// Otherwise it returns "", false.
// The caller should skip over the first len(id) bytes of s
// before further processing.
func ident(s string) (id string, ok bool) {
	// Scan [\pL_][\pL_0-9]*
	n := 0
	for n < len(s) {
		if c := s[n]; c < utf8.RuneSelf {
			// ASCII fast path: a digit is allowed anywhere
			// except as the first byte.
			if isIdentASCII(c) && (n > 0 || c < '0' || c > '9') {
				n++
				continue
			}
			break
		}
		// Non-ASCII: accept any Unicode letter.
		r, nr := utf8.DecodeRuneInString(s[n:])
		if unicode.IsLetter(r) {
			n += nr
			continue
		}
		break
	}
	return s[:n], n > 0
}

// isIdentASCII reports whether c is an ASCII identifier byte.
func isIdentASCII(c byte) bool {
	// mask is a 128-bit bitmap with 1s for allowed bytes,
	// so that the byte c can be tested with a shift and an and.
	// The low 64 bits are tested with 1<<c and the high 64 bits
	// with 1<<(c-64); a uint64 shift by 64 or more yields zero,
	// and for c < 64 the byte subtraction c-64 wraps to at least 192,
	// so only the half of the bitmap that contains c contributes.
	// If c >= 128, then 1<<c and 1<<(c-64) will both be zero,
	// and this function will return false.
	const mask = 0 |
		(1<<26-1)<<'A' |
		(1<<26-1)<<'a' |
		(1<<10-1)<<'0' |
		1<<'_'

	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
		(uint64(1)<<(c-64))&(mask>>64)) != 0
}

// validImportPath reports whether path is a valid import path.
// It is a lightly edited copy of golang.org/x/mod/module.CheckImportPath.
func validImportPath(path string) bool {
	if !utf8.ValidString(path) {
		return false
	}
	if path == "" {
		return false
	}
	if path[0] == '-' {
		return false
	}
	if strings.Contains(path, "//") {
		return false
	}
	if path[len(path)-1] == '/' {
		return false
	}
	// Validate each slash-separated element in turn.
	elemStart := 0
	for i, r := range path {
		if r == '/' {
			if !validImportPathElem(path[elemStart:i]) {
				return false
			}
			elemStart = i + 1
		}
	}
	return validImportPathElem(path[elemStart:])
}

// validImportPathElem reports whether elem is a valid import path element:
// it must be non-empty, must not begin or end with a dot,
// and must consist only of bytes accepted by importPathOK.
func validImportPathElem(elem string) bool {
	if elem == "" || elem[0] == '.' || elem[len(elem)-1] == '.' {
		return false
	}
	for i := 0; i < len(elem); i++ {
		if !importPathOK(elem[i]) {
			return false
		}
	}
	return true
}

// importPathOK reports whether c is a byte allowed in an import path element:
// an ASCII letter or digit, or one of - . ~ _ +
func importPathOK(c byte) bool {
	// mask is a 128-bit bitmap with 1s for allowed bytes,
	// so that the byte c can be tested with a shift and an and.
	// The low 64 bits are tested with 1<<c and the high 64 bits
	// with 1<<(c-64); a uint64 shift by 64 or more yields zero,
	// and for c < 64 the byte subtraction c-64 wraps to at least 192,
	// so only the half of the bitmap that contains c contributes.
	// If c >= 128, then 1<<c and 1<<(c-64) will both be zero,
	// and this function will return false.
	const mask = 0 |
		(1<<26-1)<<'A' |
		(1<<26-1)<<'a' |
		(1<<10-1)<<'0' |
		1<<'-' |
		1<<'.' |
		1<<'~' |
		1<<'_' |
		1<<'+'

	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
		(uint64(1)<<(c-64))&(mask>>64)) != 0
}