diff options
Diffstat (limited to 'src/regexp/syntax/regexp.go')
| -rw-r--r-- | src/regexp/syntax/regexp.go | 217 |
1 files changed, 187 insertions, 30 deletions
diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go index 3a4d2d201c..4fa7d0e2f8 100644 --- a/src/regexp/syntax/regexp.go +++ b/src/regexp/syntax/regexp.go @@ -112,8 +112,165 @@ func (x *Regexp) Equal(y *Regexp) bool { return true } +// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. +type printFlags uint8 + +const ( + flagI printFlags = 1 << iota // (?i: + flagM // (?m: + flagS // (?s: + flagOff // ) + flagPrec // (?: ) + negShift = 5 // flagI<<negShift is (?-i: +) + +// addSpan enables the flags f around start..last, +// by setting flags[start] = f and flags[last] = flagOff. +func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) { + if *flags == nil { + *flags = make(map[*Regexp]printFlags) + } + (*flags)[start] = f + (*flags)[last] |= flagOff // maybe start==last +} + +// calcFlags calculates the flags to print around each subexpression in re, +// storing that information in (*flags)[sub] for each affected subexpression. +// The first time an entry needs to be written to *flags, calcFlags allocates the map. +// calcFlags also calculates the flags that must be active or can't be active +// around re and returns those flags. +func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) { + switch re.Op { + default: + return 0, 0 + + case OpLiteral: + // If literal is fold-sensitive, return (flagI, 0) or (0, flagI) + // according to whether (?i) is active. + // If literal is not fold-sensitive, return 0, 0. + for _, r := range re.Rune { + if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r { + if re.Flags&FoldCase != 0 { + return flagI, 0 + } else { + return 0, flagI + } + } + } + return 0, 0 + + case OpCharClass: + // If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out. + // If literal is not fold-sensitive, return 0, 0. + for i := 0; i < len(re.Rune); i += 2 { + lo := max(minFold, re.Rune[i]) + hi := min(maxFold, re.Rune[i+1]) + for r := lo; r <= hi; r++ { + for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) { + if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) { + return 0, flagI + } + } + } + } + return 0, 0 + + case OpAnyCharNotNL: // (?-s). + return 0, flagS + + case OpAnyChar: // (?s). + return flagS, 0 + + case OpBeginLine, OpEndLine: // (?m)^ (?m)$ + return flagM, 0 + + case OpEndText: + if re.Flags&WasDollar != 0 { // (?-m)$ + return 0, flagM + } + return 0, 0 + + case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat: + return calcFlags(re.Sub[0], flags) + + case OpConcat, OpAlternate: + // Gather the must and cant for each subexpression. + // When we find a conflicting subexpression, insert the necessary + // flags around the previously identified span and start over. + var must, cant, allCant printFlags + start := 0 + last := 0 + did := false + for i, sub := range re.Sub { + subMust, subCant := calcFlags(sub, flags) + if must&subCant != 0 || subMust&cant != 0 { + if must != 0 { + addSpan(re.Sub[start], re.Sub[last], must, flags) + } + must = 0 + cant = 0 + start = i + did = true + } + must |= subMust + cant |= subCant + allCant |= subCant + if subMust != 0 { + last = i + } + if must == 0 && start == i { + start++ + } + } + if !did { + // No conflicts: pass the accumulated must and cant upward. + return must, cant + } + if must != 0 { + // Conflicts found; need to finish final span. + addSpan(re.Sub[start], re.Sub[last], must, flags) + } + return 0, allCant + } +} + // writeRegexp writes the Perl syntax for the regular expression re to b. -func writeRegexp(b *strings.Builder, re *Regexp) { +func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) { + f |= flags[re] + if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 { + // flagPrec is redundant with other flags being added and terminated + f &^= flagPrec + } + if f&^(flagOff|flagPrec) != 0 { + b.WriteString(`(?`) + if f&flagI != 0 { + b.WriteString(`i`) + } + if f&flagM != 0 { + b.WriteString(`m`) + } + if f&flagS != 0 { + b.WriteString(`s`) + } + if f&((flagM|flagS)<<negShift) != 0 { + b.WriteString(`-`) + if f&(flagM<<negShift) != 0 { + b.WriteString(`m`) + } + if f&(flagS<<negShift) != 0 { + b.WriteString(`s`) + } + } + b.WriteString(`:`) + } + if f&flagOff != 0 { + defer b.WriteString(`)`) + } + if f&flagPrec != 0 { + b.WriteString(`(?:`) + defer b.WriteString(`)`) + } + switch re.Op { default: b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") @@ -122,15 +279,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) { case OpEmptyMatch: b.WriteString(`(?:)`) case OpLiteral: - if re.Flags&FoldCase != 0 { - b.WriteString(`(?i:`) - } for _, r := range re.Rune { escape(b, r, false) } - if re.Flags&FoldCase != 0 { - b.WriteString(`)`) - } case OpCharClass: if len(re.Rune)%2 != 0 { b.WriteString(`[invalid char class]`) @@ -147,7 +298,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) { lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 escape(b, lo, lo == '-') if lo != hi { - b.WriteRune('-') + if hi != lo+1 { + b.WriteRune('-') + } escape(b, hi, hi == '-') } } @@ -156,25 +309,25 @@ func writeRegexp(b *strings.Builder, re *Regexp) { lo, hi := re.Rune[i], re.Rune[i+1] escape(b, lo, lo == '-') if lo != hi { - b.WriteRune('-') + if hi != lo+1 { + b.WriteRune('-') + } escape(b, hi, hi == '-') } } } b.WriteRune(']') - case OpAnyCharNotNL: - b.WriteString(`(?-s:.)`) - case OpAnyChar: - b.WriteString(`(?s:.)`) + case OpAnyCharNotNL, OpAnyChar: + b.WriteString(`.`) case OpBeginLine: - b.WriteString(`(?m:^)`) + b.WriteString(`^`) case OpEndLine: - b.WriteString(`(?m:$)`) + b.WriteString(`$`) case OpBeginText: b.WriteString(`\A`) case OpEndText: if re.Flags&WasDollar != 0 { - b.WriteString(`(?-m:$)`) + b.WriteString(`$`) } else { b.WriteString(`\z`) } @@ -191,17 +344,17 @@ func writeRegexp(b *strings.Builder, re *Regexp) { b.WriteRune('(') } if re.Sub[0].Op != OpEmptyMatch { - writeRegexp(b, re.Sub[0]) + writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) } b.WriteRune(')') case OpStar, OpPlus, OpQuest, OpRepeat: - if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) + p := printFlags(0) + sub := re.Sub[0] + if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { + p = flagPrec } + writeRegexp(b, sub, p, flags) + switch re.Op { case OpStar: b.WriteRune('*') @@ -225,27 +378,31 @@ func writeRegexp(b *strings.Builder, re *Regexp) { } case OpConcat: for _, sub := range re.Sub { + p := printFlags(0) if sub.Op == OpAlternate { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) + p = flagPrec } + writeRegexp(b, sub, p, flags) } case OpAlternate: for i, sub := range re.Sub { if i > 0 { b.WriteRune('|') } - writeRegexp(b, sub) + writeRegexp(b, sub, 0, flags) } } } func (re *Regexp) String() string { var b strings.Builder - writeRegexp(&b, re) + var flags map[*Regexp]printFlags + must, cant := calcFlags(re, &flags) + must |= (cant &^ flagI) << negShift + if must != 0 { + must |= flagOff + } + writeRegexp(&b, re, must, flags) return b.String() } |
