Black Lives Matter. Support the Equal Justice Initiative.

Source file src/go/doc/comment.go

Documentation: go/doc

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Godoc comment extraction and comment -> HTML formatting.
     6  
     7  package doc
     8  
     9  import (
    10  	"bytes"
    11  	"internal/lazyregexp"
    12  	"io"
    13  	"strings"
    14  	"text/template" // for HTMLEscape
    15  	"unicode"
    16  	"unicode/utf8"
    17  )
    18  
    19  const (
    20  	ldquo = "“"
    21  	rdquo = "”"
    22  	ulquo = "“"
    23  	urquo = "”"
    24  )
    25  
    26  var (
    27  	htmlQuoteReplacer    = strings.NewReplacer(ulquo, ldquo, urquo, rdquo)
    28  	unicodeQuoteReplacer = strings.NewReplacer("``", ulquo, "''", urquo)
    29  )
    30  
    31  // Escape comment text for HTML. If nice is set,
    32  // also turn `` into “ and '' into ”.
    33  func commentEscape(w io.Writer, text string, nice bool) {
    34  	if nice {
    35  		// In the first pass, we convert `` and '' into their unicode equivalents.
    36  		// This prevents them from being escaped in HTMLEscape.
    37  		text = convertQuotes(text)
    38  		var buf bytes.Buffer
    39  		template.HTMLEscape(&buf, []byte(text))
    40  		// Now we convert the unicode quotes to their HTML escaped entities to maintain old behavior.
    41  		// We need to use a temp buffer to read the string back and do the conversion,
    42  		// otherwise HTMLEscape will escape & to &
    43  		htmlQuoteReplacer.WriteString(w, buf.String())
    44  		return
    45  	}
    46  	template.HTMLEscape(w, []byte(text))
    47  }
    48  
    49  func convertQuotes(text string) string {
    50  	return unicodeQuoteReplacer.Replace(text)
    51  }
    52  
// Regular expression source strings; they are combined into matchRx.
const (
	// Regexp for Go identifiers: a letter or underscore followed by any
	// number of letters, underscores, or ASCII digits.
	identRx = `[\pL_][\pL_0-9]*`

	// Regexp for URLs
	// Match parens, and check later for balance - see #5043, #22285
	// Match .,:;?! within path, but not at end - see #18139, #16565
	// This excludes some rare yet valid urls ending in common punctuation
	// in order to allow sentences ending in URLs.

	// protocol (required) e.g. http
	protoPart = `(https?|ftp|file|gopher|mailto|nntp)`
	// host (required) e.g. www.example.com or [::1]:8080
	hostPart = `([a-zA-Z0-9_@\-.\[\]:]+)`
	// path+query+fragment (optional) e.g. /path/index.html?q=foo#bar
	// Each repetition is optional punctuation followed by one
	// non-punctuation character, so a match never ends in .,:;?!
	pathPart = `([.,:;?!]*[a-zA-Z0-9$'()*+&#=@~_/\-\[\]%])*`

	// urlRx is the complete URL pattern: protocol, "://", host, optional path.
	urlRx = protoPart + `://` + hostPart + pathPart
)
    72  
// matchRx matches either a URL (urlRx, the first parenthesized group) or a
// Go identifier (identRx).
var matchRx = lazyregexp.New(`(` + urlRx + `)|(` + identRx + `)`)
    74  
// HTML fragments written by emphasize and ToHTML. The html_a/html_aq and
// html_h/html_hq pairs bracket an attribute value: the first opens the tag up
// to the opening quote, the second closes the quote and the tag.
var (
	html_a      = []byte(`<a href="`)
	html_aq     = []byte(`">`)
	html_enda   = []byte("</a>")
	html_i      = []byte("<i>")
	html_endi   = []byte("</i>")
	html_p      = []byte("<p>\n")
	html_endp   = []byte("</p>\n")
	html_pre    = []byte("<pre>")
	html_endpre = []byte("</pre>\n")
	html_h      = []byte(`<h3 id="`)
	html_hq     = []byte(`">`)
	html_endh   = []byte("</h3>\n")
)
    89  
    90  // Emphasize and escape a line of text for HTML. URLs are converted into links;
    91  // if the URL also appears in the words map, the link is taken from the map (if
    92  // the corresponding map value is the empty string, the URL is not converted
    93  // into a link). Go identifiers that appear in the words map are italicized; if
    94  // the corresponding map value is not the empty string, it is considered a URL
    95  // and the word is converted into a link. If nice is set, the remaining text's
    96  // appearance is improved where it makes sense (e.g., `` is turned into &ldquo;
    97  // and '' into &rdquo;).
    98  func emphasize(w io.Writer, line string, words map[string]string, nice bool) {
    99  	for {
   100  		m := matchRx.FindStringSubmatchIndex(line)
   101  		if m == nil {
   102  			break
   103  		}
   104  		// m >= 6 (two parenthesized sub-regexps in matchRx, 1st one is urlRx)
   105  
   106  		// write text before match
   107  		commentEscape(w, line[0:m[0]], nice)
   108  
   109  		// adjust match for URLs
   110  		match := line[m[0]:m[1]]
   111  		if strings.Contains(match, "://") {
   112  			m0, m1 := m[0], m[1]
   113  			for _, s := range []string{"()", "{}", "[]"} {
   114  				open, close := s[:1], s[1:] // E.g., "(" and ")"
   115  				// require opening parentheses before closing parentheses (#22285)
   116  				if i := strings.Index(match, close); i >= 0 && i < strings.Index(match, open) {
   117  					m1 = m0 + i
   118  					match = line[m0:m1]
   119  				}
   120  				// require balanced pairs of parentheses (#5043)
   121  				for i := 0; strings.Count(match, open) != strings.Count(match, close) && i < 10; i++ {
   122  					m1 = strings.LastIndexAny(line[:m1], s)
   123  					match = line[m0:m1]
   124  				}
   125  			}
   126  			if m1 != m[1] {
   127  				// redo matching with shortened line for correct indices
   128  				m = matchRx.FindStringSubmatchIndex(line[:m[0]+len(match)])
   129  			}
   130  		}
   131  
   132  		// analyze match
   133  		url := ""
   134  		italics := false
   135  		if words != nil {
   136  			url, italics = words[match]
   137  		}
   138  		if m[2] >= 0 {
   139  			// match against first parenthesized sub-regexp; must be match against urlRx
   140  			if !italics {
   141  				// no alternative URL in words list, use match instead
   142  				url = match
   143  			}
   144  			italics = false // don't italicize URLs
   145  		}
   146  
   147  		// write match
   148  		if len(url) > 0 {
   149  			w.Write(html_a)
   150  			template.HTMLEscape(w, []byte(url))
   151  			w.Write(html_aq)
   152  		}
   153  		if italics {
   154  			w.Write(html_i)
   155  		}
   156  		commentEscape(w, match, nice)
   157  		if italics {
   158  			w.Write(html_endi)
   159  		}
   160  		if len(url) > 0 {
   161  			w.Write(html_enda)
   162  		}
   163  
   164  		// advance
   165  		line = line[m[1]:]
   166  	}
   167  	commentEscape(w, line, nice)
   168  }
   169  
   170  func indentLen(s string) int {
   171  	i := 0
   172  	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
   173  		i++
   174  	}
   175  	return i
   176  }
   177  
   178  func isBlank(s string) bool {
   179  	return len(s) == 0 || (len(s) == 1 && s[0] == '\n')
   180  }
   181  
   182  func commonPrefix(a, b string) string {
   183  	i := 0
   184  	for i < len(a) && i < len(b) && a[i] == b[i] {
   185  		i++
   186  	}
   187  	return a[0:i]
   188  }
   189  
   190  func unindent(block []string) {
   191  	if len(block) == 0 {
   192  		return
   193  	}
   194  
   195  	// compute maximum common white prefix
   196  	prefix := block[0][0:indentLen(block[0])]
   197  	for _, line := range block {
   198  		if !isBlank(line) {
   199  			prefix = commonPrefix(prefix, line[0:indentLen(line)])
   200  		}
   201  	}
   202  	n := len(prefix)
   203  
   204  	// remove
   205  	for i, line := range block {
   206  		if !isBlank(line) {
   207  			block[i] = line[n:]
   208  		}
   209  	}
   210  }
   211  
   212  // heading returns the trimmed line if it passes as a section heading;
   213  // otherwise it returns the empty string.
   214  func heading(line string) string {
   215  	line = strings.TrimSpace(line)
   216  	if len(line) == 0 {
   217  		return ""
   218  	}
   219  
   220  	// a heading must start with an uppercase letter
   221  	r, _ := utf8.DecodeRuneInString(line)
   222  	if !unicode.IsLetter(r) || !unicode.IsUpper(r) {
   223  		return ""
   224  	}
   225  
   226  	// it must end in a letter or digit:
   227  	r, _ = utf8.DecodeLastRuneInString(line)
   228  	if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
   229  		return ""
   230  	}
   231  
   232  	// exclude lines with illegal characters. we allow "(),"
   233  	if strings.ContainsAny(line, ";:!?+*/=[]{}_^°&§~%#@<\">\\") {
   234  		return ""
   235  	}
   236  
   237  	// allow "'" for possessive "'s" only
   238  	for b := line; ; {
   239  		i := strings.IndexRune(b, '\'')
   240  		if i < 0 {
   241  			break
   242  		}
   243  		if i+1 >= len(b) || b[i+1] != 's' || (i+2 < len(b) && b[i+2] != ' ') {
   244  			return "" // not followed by "s "
   245  		}
   246  		b = b[i+2:]
   247  	}
   248  
   249  	// allow "." when followed by non-space
   250  	for b := line; ; {
   251  		i := strings.IndexRune(b, '.')
   252  		if i < 0 {
   253  			break
   254  		}
   255  		if i+1 >= len(b) || b[i+1] == ' ' {
   256  			return "" // not followed by non-space
   257  		}
   258  		b = b[i+1:]
   259  	}
   260  
   261  	return line
   262  }
   263  
// op identifies the kind of a block.
type op int

const (
	opPara op = iota // paragraph of unindented text
	opHead           // section heading
	opPre            // preformatted (indented) text
)

// A block is a run of comment lines that are all of the same kind,
// as produced by blocks.
type block struct {
	op    op
	lines []string
}
   276  
   277  var nonAlphaNumRx = lazyregexp.New(`[^a-zA-Z0-9]`)
   278  
   279  func anchorID(line string) string {
   280  	// Add a "hdr-" prefix to avoid conflicting with IDs used for package symbols.
   281  	return "hdr-" + nonAlphaNumRx.ReplaceAllString(line, "_")
   282  }
   283  
   284  // ToHTML converts comment text to formatted HTML.
   285  // The comment was prepared by DocReader,
   286  // so it is known not to have leading, trailing blank lines
   287  // nor to have trailing spaces at the end of lines.
   288  // The comment markers have already been removed.
   289  //
   290  // Each span of unindented non-blank lines is converted into
   291  // a single paragraph. There is one exception to the rule: a span that
   292  // consists of a single line, is followed by another paragraph span,
   293  // begins with a capital letter, and contains no punctuation
   294  // other than parentheses and commas is formatted as a heading.
   295  //
   296  // A span of indented lines is converted into a <pre> block,
   297  // with the common indent prefix removed.
   298  //
   299  // URLs in the comment text are converted into links; if the URL also appears
   300  // in the words map, the link is taken from the map (if the corresponding map
   301  // value is the empty string, the URL is not converted into a link).
   302  //
   303  // A pair of (consecutive) backticks (`) is converted to a unicode left quote (“), and a pair of (consecutive)
   304  // single quotes (') is converted to a unicode right quote (”).
   305  //
   306  // Go identifiers that appear in the words map are italicized; if the corresponding
   307  // map value is not the empty string, it is considered a URL and the word is converted
   308  // into a link.
   309  func ToHTML(w io.Writer, text string, words map[string]string) {
   310  	for _, b := range blocks(text) {
   311  		switch b.op {
   312  		case opPara:
   313  			w.Write(html_p)
   314  			for _, line := range b.lines {
   315  				emphasize(w, line, words, true)
   316  			}
   317  			w.Write(html_endp)
   318  		case opHead:
   319  			w.Write(html_h)
   320  			id := ""
   321  			for _, line := range b.lines {
   322  				if id == "" {
   323  					id = anchorID(line)
   324  					w.Write([]byte(id))
   325  					w.Write(html_hq)
   326  				}
   327  				commentEscape(w, line, true)
   328  			}
   329  			if id == "" {
   330  				w.Write(html_hq)
   331  			}
   332  			w.Write(html_endh)
   333  		case opPre:
   334  			w.Write(html_pre)
   335  			for _, line := range b.lines {
   336  				emphasize(w, line, nil, false)
   337  			}
   338  			w.Write(html_endpre)
   339  		}
   340  	}
   341  }
   342  
// blocks splits text into a sequence of paragraph, heading, and preformatted
// blocks. The text is split after each newline, so lines keep their trailing
// "\n" (heading lines are the exception: they are stored as trimmed by
// heading). The common indentation of the whole text is removed first; lines
// that are still indented afterwards form preformatted blocks.
func blocks(text string) []block {
	var (
		out  []block
		para []string // lines of the paragraph being accumulated; nil if none

		lastWasBlank   = false
		lastWasHeading = false
	)

	// close emits the pending paragraph, if any, as an opPara block.
	close := func() {
		if para != nil {
			out = append(out, block{opPara, para})
			para = nil
		}
	}

	lines := strings.SplitAfter(text, "\n")
	unindent(lines)
	for i := 0; i < len(lines); {
		line := lines[i]
		if isBlank(line) {
			// close paragraph
			close()
			i++
			lastWasBlank = true
			continue
		}
		if indentLen(line) > 0 {
			// close paragraph
			close()

			// count indented or blank lines
			j := i + 1
			for j < len(lines) && (isBlank(lines[j]) || indentLen(lines[j]) > 0) {
				j++
			}
			// but not trailing blank lines
			for j > i && isBlank(lines[j-1]) {
				j--
			}
			pre := lines[i:j]
			i = j

			// remove the indentation common to the lines of this block
			unindent(pre)

			// put those lines in a pre block
			out = append(out, block{opPre, pre})
			lastWasHeading = false
			continue
		}

		if lastWasBlank && !lastWasHeading && i+2 < len(lines) &&
			isBlank(lines[i+1]) && !isBlank(lines[i+2]) && indentLen(lines[i+2]) == 0 {
			// current line is non-blank, surrounded by blank lines
			// and the next non-blank line is not indented: this
			// might be a heading.
			if head := heading(line); head != "" {
				close()
				out = append(out, block{opHead, []string{head}})
				// skip the heading and the blank line following it
				i += 2
				lastWasHeading = true
				continue
			}
		}

		// open paragraph
		lastWasBlank = false
		lastWasHeading = false
		para = append(para, lines[i])
		i++
	}
	// emit the final paragraph, if one is still open
	close()

	return out
}
   418  
   419  // ToText prepares comment text for presentation in textual output.
   420  // It wraps paragraphs of text to width or fewer Unicode code points
   421  // and then prefixes each line with the indent. In preformatted sections
   422  // (such as program text), it prefixes each non-blank line with preIndent.
   423  //
   424  // A pair of (consecutive) backticks (`) is converted to a unicode left quote (“), and a pair of (consecutive)
   425  // single quotes (') is converted to a unicode right quote (”).
   426  func ToText(w io.Writer, text string, indent, preIndent string, width int) {
   427  	l := lineWrapper{
   428  		out:    w,
   429  		width:  width,
   430  		indent: indent,
   431  	}
   432  	for _, b := range blocks(text) {
   433  		switch b.op {
   434  		case opPara:
   435  			// l.write will add leading newline if required
   436  			for _, line := range b.lines {
   437  				line = convertQuotes(line)
   438  				l.write(line)
   439  			}
   440  			l.flush()
   441  		case opHead:
   442  			w.Write(nl)
   443  			for _, line := range b.lines {
   444  				line = convertQuotes(line)
   445  				l.write(line + "\n")
   446  			}
   447  			l.flush()
   448  		case opPre:
   449  			w.Write(nl)
   450  			for _, line := range b.lines {
   451  				if isBlank(line) {
   452  					w.Write([]byte("\n"))
   453  				} else {
   454  					w.Write([]byte(preIndent))
   455  					w.Write([]byte(line))
   456  				}
   457  			}
   458  		}
   459  	}
   460  }
   461  
   462  type lineWrapper struct {
   463  	out       io.Writer
   464  	printed   bool
   465  	width     int
   466  	indent    string
   467  	n         int
   468  	pendSpace int
   469  }
   470  
   471  var nl = []byte("\n")
   472  var space = []byte(" ")
   473  var prefix = []byte("// ")
   474  
   475  func (l *lineWrapper) write(text string) {
   476  	if l.n == 0 && l.printed {
   477  		l.out.Write(nl) // blank line before new paragraph
   478  	}
   479  	l.printed = true
   480  
   481  	needsPrefix := false
   482  	isComment := strings.HasPrefix(text, "//")
   483  	for _, f := range strings.Fields(text) {
   484  		w := utf8.RuneCountInString(f)
   485  		// wrap if line is too long
   486  		if l.n > 0 && l.n+l.pendSpace+w > l.width {
   487  			l.out.Write(nl)
   488  			l.n = 0
   489  			l.pendSpace = 0
   490  			needsPrefix = isComment && !strings.HasPrefix(f, "//")
   491  		}
   492  		if l.n == 0 {
   493  			l.out.Write([]byte(l.indent))
   494  		}
   495  		if needsPrefix {
   496  			l.out.Write(prefix)
   497  			needsPrefix = false
   498  		}
   499  		l.out.Write(space[:l.pendSpace])
   500  		l.out.Write([]byte(f))
   501  		l.n += l.pendSpace + w
   502  		l.pendSpace = 1
   503  	}
   504  }
   505  
   506  func (l *lineWrapper) flush() {
   507  	if l.n == 0 {
   508  		return
   509  	}
   510  	l.out.Write(nl)
   511  	l.pendSpace = 0
   512  	l.n = 0
   513  }
   514  

View as plain text