Black Lives Matter. Support the Equal Justice Initiative.

Source file src/regexp/syntax/regexp.go

Documentation: regexp/syntax

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // Note to implementers:
     8  // In this package, re is always a *Regexp and r is always a rune.
     9  
    10  import (
    11  	"strconv"
    12  	"strings"
    13  	"unicode"
    14  )
    15  
    16  // A Regexp is a node in a regular expression syntax tree.
    17  type Regexp struct {
    18  	Op       Op // operator
    19  	Flags    Flags
    20  	Sub      []*Regexp  // subexpressions, if any
    21  	Sub0     [1]*Regexp // storage for short Sub
    22  	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
    23  	Rune0    [2]rune    // storage for short Rune
    24  	Min, Max int        // min, max for OpRepeat
    25  	Cap      int        // capturing index, for OpCapture
    26  	Name     string     // capturing name, for OpCapture
    27  }
    28  
    29  //go:generate stringer -type Op -trimprefix Op
    30  
    31  // An Op is a single regular expression operator.
    32  type Op uint8
    33  
    34  // Operators are listed in precedence order, tightest binding to weakest.
    35  // Character class operators are listed simplest to most complex
    36  // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
    37  
    38  const (
    39  	OpNoMatch        Op = 1 + iota // matches no strings
    40  	OpEmptyMatch                   // matches empty string
    41  	OpLiteral                      // matches Runes sequence
    42  	OpCharClass                    // matches Runes interpreted as range pair list
    43  	OpAnyCharNotNL                 // matches any character except newline
    44  	OpAnyChar                      // matches any character
    45  	OpBeginLine                    // matches empty string at beginning of line
    46  	OpEndLine                      // matches empty string at end of line
    47  	OpBeginText                    // matches empty string at beginning of text
    48  	OpEndText                      // matches empty string at end of text
    49  	OpWordBoundary                 // matches word boundary `\b`
    50  	OpNoWordBoundary               // matches word non-boundary `\B`
    51  	OpCapture                      // capturing subexpression with index Cap, optional name Name
    52  	OpStar                         // matches Sub[0] zero or more times
    53  	OpPlus                         // matches Sub[0] one or more times
    54  	OpQuest                        // matches Sub[0] zero or one times
    55  	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
    56  	OpConcat                       // matches concatenation of Subs
    57  	OpAlternate                    // matches alternation of Subs
    58  )
    59  
    60  const opPseudo Op = 128 // where pseudo-ops start
    61  
    62  // Equal reports whether x and y have identical structure.
    63  func (x *Regexp) Equal(y *Regexp) bool {
    64  	if x == nil || y == nil {
    65  		return x == y
    66  	}
    67  	if x.Op != y.Op {
    68  		return false
    69  	}
    70  	switch x.Op {
    71  	case OpEndText:
    72  		// The parse flags remember whether this is \z or \Z.
    73  		if x.Flags&WasDollar != y.Flags&WasDollar {
    74  			return false
    75  		}
    76  
    77  	case OpLiteral, OpCharClass:
    78  		if len(x.Rune) != len(y.Rune) {
    79  			return false
    80  		}
    81  		for i, r := range x.Rune {
    82  			if r != y.Rune[i] {
    83  				return false
    84  			}
    85  		}
    86  
    87  	case OpAlternate, OpConcat:
    88  		if len(x.Sub) != len(y.Sub) {
    89  			return false
    90  		}
    91  		for i, sub := range x.Sub {
    92  			if !sub.Equal(y.Sub[i]) {
    93  				return false
    94  			}
    95  		}
    96  
    97  	case OpStar, OpPlus, OpQuest:
    98  		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
    99  			return false
   100  		}
   101  
   102  	case OpRepeat:
   103  		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
   104  			return false
   105  		}
   106  
   107  	case OpCapture:
   108  		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
   109  			return false
   110  		}
   111  	}
   112  	return true
   113  }
   114  
   115  // writeRegexp writes the Perl syntax for the regular expression re to b.
   116  func writeRegexp(b *strings.Builder, re *Regexp) {
   117  	switch re.Op {
   118  	default:
   119  		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
   120  	case OpNoMatch:
   121  		b.WriteString(`[^\x00-\x{10FFFF}]`)
   122  	case OpEmptyMatch:
   123  		b.WriteString(`(?:)`)
   124  	case OpLiteral:
   125  		if re.Flags&FoldCase != 0 {
   126  			b.WriteString(`(?i:`)
   127  		}
   128  		for _, r := range re.Rune {
   129  			escape(b, r, false)
   130  		}
   131  		if re.Flags&FoldCase != 0 {
   132  			b.WriteString(`)`)
   133  		}
   134  	case OpCharClass:
   135  		if len(re.Rune)%2 != 0 {
   136  			b.WriteString(`[invalid char class]`)
   137  			break
   138  		}
   139  		b.WriteRune('[')
   140  		if len(re.Rune) == 0 {
   141  			b.WriteString(`^\x00-\x{10FFFF}`)
   142  		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
   143  			// Contains 0 and MaxRune. Probably a negated class.
   144  			// Print the gaps.
   145  			b.WriteRune('^')
   146  			for i := 1; i < len(re.Rune)-1; i += 2 {
   147  				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
   148  				escape(b, lo, lo == '-')
   149  				if lo != hi {
   150  					b.WriteRune('-')
   151  					escape(b, hi, hi == '-')
   152  				}
   153  			}
   154  		} else {
   155  			for i := 0; i < len(re.Rune); i += 2 {
   156  				lo, hi := re.Rune[i], re.Rune[i+1]
   157  				escape(b, lo, lo == '-')
   158  				if lo != hi {
   159  					b.WriteRune('-')
   160  					escape(b, hi, hi == '-')
   161  				}
   162  			}
   163  		}
   164  		b.WriteRune(']')
   165  	case OpAnyCharNotNL:
   166  		b.WriteString(`(?-s:.)`)
   167  	case OpAnyChar:
   168  		b.WriteString(`(?s:.)`)
   169  	case OpBeginLine:
   170  		b.WriteString(`(?m:^)`)
   171  	case OpEndLine:
   172  		b.WriteString(`(?m:$)`)
   173  	case OpBeginText:
   174  		b.WriteString(`\A`)
   175  	case OpEndText:
   176  		if re.Flags&WasDollar != 0 {
   177  			b.WriteString(`(?-m:$)`)
   178  		} else {
   179  			b.WriteString(`\z`)
   180  		}
   181  	case OpWordBoundary:
   182  		b.WriteString(`\b`)
   183  	case OpNoWordBoundary:
   184  		b.WriteString(`\B`)
   185  	case OpCapture:
   186  		if re.Name != "" {
   187  			b.WriteString(`(?P<`)
   188  			b.WriteString(re.Name)
   189  			b.WriteRune('>')
   190  		} else {
   191  			b.WriteRune('(')
   192  		}
   193  		if re.Sub[0].Op != OpEmptyMatch {
   194  			writeRegexp(b, re.Sub[0])
   195  		}
   196  		b.WriteRune(')')
   197  	case OpStar, OpPlus, OpQuest, OpRepeat:
   198  		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
   199  			b.WriteString(`(?:`)
   200  			writeRegexp(b, sub)
   201  			b.WriteString(`)`)
   202  		} else {
   203  			writeRegexp(b, sub)
   204  		}
   205  		switch re.Op {
   206  		case OpStar:
   207  			b.WriteRune('*')
   208  		case OpPlus:
   209  			b.WriteRune('+')
   210  		case OpQuest:
   211  			b.WriteRune('?')
   212  		case OpRepeat:
   213  			b.WriteRune('{')
   214  			b.WriteString(strconv.Itoa(re.Min))
   215  			if re.Max != re.Min {
   216  				b.WriteRune(',')
   217  				if re.Max >= 0 {
   218  					b.WriteString(strconv.Itoa(re.Max))
   219  				}
   220  			}
   221  			b.WriteRune('}')
   222  		}
   223  		if re.Flags&NonGreedy != 0 {
   224  			b.WriteRune('?')
   225  		}
   226  	case OpConcat:
   227  		for _, sub := range re.Sub {
   228  			if sub.Op == OpAlternate {
   229  				b.WriteString(`(?:`)
   230  				writeRegexp(b, sub)
   231  				b.WriteString(`)`)
   232  			} else {
   233  				writeRegexp(b, sub)
   234  			}
   235  		}
   236  	case OpAlternate:
   237  		for i, sub := range re.Sub {
   238  			if i > 0 {
   239  				b.WriteRune('|')
   240  			}
   241  			writeRegexp(b, sub)
   242  		}
   243  	}
   244  }
   245  
   246  func (re *Regexp) String() string {
   247  	var b strings.Builder
   248  	writeRegexp(&b, re)
   249  	return b.String()
   250  }
   251  
   252  const meta = `\.+*?()|[]{}^$`
   253  
   254  func escape(b *strings.Builder, r rune, force bool) {
   255  	if unicode.IsPrint(r) {
   256  		if strings.ContainsRune(meta, r) || force {
   257  			b.WriteRune('\\')
   258  		}
   259  		b.WriteRune(r)
   260  		return
   261  	}
   262  
   263  	switch r {
   264  	case '\a':
   265  		b.WriteString(`\a`)
   266  	case '\f':
   267  		b.WriteString(`\f`)
   268  	case '\n':
   269  		b.WriteString(`\n`)
   270  	case '\r':
   271  		b.WriteString(`\r`)
   272  	case '\t':
   273  		b.WriteString(`\t`)
   274  	case '\v':
   275  		b.WriteString(`\v`)
   276  	default:
   277  		if r < 0x100 {
   278  			b.WriteString(`\x`)
   279  			s := strconv.FormatInt(int64(r), 16)
   280  			if len(s) == 1 {
   281  				b.WriteRune('0')
   282  			}
   283  			b.WriteString(s)
   284  			break
   285  		}
   286  		b.WriteString(`\x{`)
   287  		b.WriteString(strconv.FormatInt(int64(r), 16))
   288  		b.WriteString(`}`)
   289  	}
   290  }
   291  
   292  // MaxCap walks the regexp to find the maximum capture index.
   293  func (re *Regexp) MaxCap() int {
   294  	m := 0
   295  	if re.Op == OpCapture {
   296  		m = re.Cap
   297  	}
   298  	for _, sub := range re.Sub {
   299  		if n := sub.MaxCap(); m < n {
   300  			m = n
   301  		}
   302  	}
   303  	return m
   304  }
   305  
   306  // CapNames walks the regexp to find the names of capturing groups.
   307  func (re *Regexp) CapNames() []string {
   308  	names := make([]string, re.MaxCap()+1)
   309  	re.capNames(names)
   310  	return names
   311  }
   312  
   313  func (re *Regexp) capNames(names []string) {
   314  	if re.Op == OpCapture {
   315  		names[re.Cap] = re.Name
   316  	}
   317  	for _, sub := range re.Sub {
   318  		sub.capNames(names)
   319  	}
   320  }
   321  

View as plain text